mp_machdep.c revision 255726
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 255726 2013-09-20 05:06:03Z gibbs $");

#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <machine/cpufunc.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>
#include <machine/cpu.h>

#ifdef XENHVM
#include <xen/hvm.h>
#endif

/* BIOS warm-boot vector: segment:offset stored in low BIOS data area. */
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

/* CMOS ports and the shutdown-status byte used to request a warm start. */
#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Applications processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;		/* logical CPU id of the AP being started */

/* Free these after use */
void *bootstacks[MAXCPU];

/* Temporary variables for init_secondary(); reallocated per AP started. */
char *doublefault_stack;
char *nmi_stack;
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct pcb **susppcbs;

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr2;
struct invpcid_descr smp_tlb_invpcid;
volatile int smp_tlb_wait;
uint64_t pcid_cr3;
pmap_t smp_tlb_pmap;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops = {
	.ipi_vectored = lapic_ipi_vectored
};

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

extern int pmap_pcid_enabled;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_nmi_pending;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
	int	cpu_hyperthread:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;
static int cpu_logical;			/* logical cpus per core */
static int cpu_cores;			/* cores per package */

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;	/* logical cpus sharing L1 cache */
static int	hyperthreading_allowed = 1;
static u_int	bootMP_size;

/* Propagate the BSP's MTRR settings to a freshly started AP. */
static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Determine cpu_cores/cpu_logical for AMD processors, either from the
 * AMDID_CMP_CORES count or, for Fam 10h+, by counting present APIC IDs
 * that share the BSP's package bits.
 */
static void
topo_probe_amd(void)
{
	int core_id_bits;
	int id;

	/* AMD processors do not support HTT. */
	cpu_logical = 1;

	if ((amd_feature2 & AMDID2_CMP) == 0) {
		cpu_cores = 1;
		return;
	}

	core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;
	if (core_id_bits == 0) {
		cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
		return;
	}

	/* Fam 10h and newer should get here. */
	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
			continue;
		cpu_cores++;
	}
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Probe Intel topology via CPUID leaves 1 and 4 (the pre-leaf-0xb
 * method): derive APIC ID field widths from the maximum logical and
 * core counts, then count present CPUs sharing the BSP's package/core.
 */
static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as BSP.  Further, we count number of
	 * logical processors that belong to the same core
	 * as BSP thus deducing number of threads per core.
	 */
	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	/* cpu_cores counted threads in the package; convert to cores. */
	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

/*
 * Probe Intel topology via CPUID leaf 0xb (x2APIC topology
 * enumeration), falling back to topo_probe_0x4() if the leaf is
 * not actually implemented.
 */
static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	/* The CORE level counted threads; convert to cores per package. */
	cpu_cores /= cpu_logical;
}

/*
 * Both topology discovery code and code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;	/* run-once latch */

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}

/*
 * Build the scheduler's CPU topology tree from the probed
 * package/core/thread counts.
 */
struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}

/*
 * Record a CPU discovered by the MP table / MADT scan.  Called once per
 * local APIC ID; tracks the BSP and maintains mp_ncpus/mp_maxid.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
		    mp_maxid, mp_ncpus));
}

/*
 * Returns non-zero if this is an MP system (two or more CPUs found);
 * also normalizes mp_ncpus/mp_maxid/all_cpus for the UP cases.
 */
int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	if (pmap_pcid_enabled) {
		/* PCID-aware handlers preserve other contexts' TLB entries. */
		setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
		    SEL_KPL, 0);
		setidt(IPI_INVLPG, IDTVEC(invlpg_pcid), SDT_SYSIGT,
		    SEL_KPL, 0);
	} else {
		setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
		setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	}
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	    SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU suspend/resume */
	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));

	/* Probe logical/physical core configuration. */
	topo_probe();

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	set_interrupt_apic_ids();
}


/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	const char *hyperthread;
	int i;

	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
	if (hyperthreading_cpus > 1)
		printf(" x %d HTT threads", cpu_logical);
	else if (cpu_logical > 1)
		printf(" x %d SMT threads", cpu_logical);
	printf("\n");

	/* List active CPUs first. */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1; i < mp_ncpus; i++) {
		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
		    cpu_apic_ids[i]);
	}

	/* List disabled CPUs last. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
			continue;
		if (cpu_info[i].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
		    i);
	}
}

/*
 * AP CPU's call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	struct nmi_pcpu *np;
	u_int64_t msr, cr0;
	u_int cpuid;
	int cpu, gsel_tss, x;
	struct region_descriptor ap_gdt;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */
	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
	    IOPAGES * PAGE_SIZE;
	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

	/* The NMI stack runs on IST2. */
	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
	common_tss[cpu].tss_ist2 = (long) np;

	/* Prepare private GDT */
	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	for (x = 0; x < NGDT; x++) {
		/* Skip the system-descriptor slots; they take two entries. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	ap_gdt.rd_base = (long) &gdt[NGDT * cpu];
	lgdt(&ap_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	dpcpu_init(dpcpu, cpu);
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_commontssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;
	pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GPROC0_SEL];
	pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
	    GUSERLDT_SEL];

	/* Save the per-cpu pointer for use by the NMI handler. */
	np->np_pcpu = (register_t) pc;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu();

	/* set up FPU state on the AP */
	fpuinit();

#ifdef XENHVM
	/* register vcpu_info area */
	xen_hvm_init_cpu();
#endif

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mca_init();

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	printf("SMP: AP CPU #%d Launched!\n", cpuid);

	/* Determine if we are a logical CPU. */
	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	/* historic */
	}

	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */

	load_cr4(rcr4() | CR4_PGE);
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the AP's are up. */
	while (smp_started == 0)
		ia32_pause();

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;

			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) {
		hyperthreading_cpus = 0;
		cpu_logical = 1;
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	/* Walk all APIC IDs once, wrapping around past MAX_APIC_ID. */
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_char mpbiosreason;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up an idle stack data page */
		bootstacks[cpu] = (void *)kmem_malloc(kernel_arena,
		    KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO);
		doublefault_stack = (char *)kmem_malloc(kernel_arena,
		    PAGE_SIZE, M_WAITOK | M_ZERO);
		nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
		    M_WAITOK | M_ZERO);
		dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
		    M_WAITOK | M_ZERO);

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
	}

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}


/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	ipi_startup(apic_id, vector);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		/* The AP increments mp_naps from init_secondary(). */
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
    &ipi_range_size, 0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(-1);
	DELAY(10000);		/* wait ~10mS */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		/*
		 * Bitmapped IPIs share the single IPI_BITMAP_VECTOR: record
		 * the request atomically in the target CPU's pending word.
		 * If a request was already outstanding (old_pending != 0)
		 * the vector is already in flight, so avoid sending a
		 * redundant interrupt.
		 */
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

/*
 * Flush the TLB on all other CPU's
 *
 * Publishes the shootdown parameters in the smp_tlb_* globals (protected
 * by smp_ipi_mtx), sends 'vector' to every other CPU, and spins until each
 * target acknowledges by incrementing smp_tlb_wait.  A NULL pmap requests
 * a non-PCID (pcid 0) invalidation.
 */
static void
smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
    vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	/*
	 * Sending with interrupts disabled can deadlock: another CPU may
	 * hold smp_ipi_mtx and be waiting for our acknowledgement, which
	 * requires our IPI handler to run.
	 */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_invpcid.addr = addr1;
	if (pmap == NULL) {
		smp_tlb_invpcid.pcid = 0;
	} else {
		smp_tlb_invpcid.pcid = pmap->pm_pcid;
		pcid_cr3 = pmap->pm_cr3;
	}
	smp_tlb_addr2 = addr2;
	smp_tlb_pmap = pmap;
	/* Release-store so targets see the parameters before the zeroing. */
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/*
 * As smp_tlb_shootdown(), but only the CPUs in 'mask' are targeted.
 * A full mask degenerates to ipi_all_but_self(); otherwise the current
 * CPU is stripped from the mask and each remaining CPU is IPIed
 * individually.
 */
static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
{
	int cpu, ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (CPU_ISFULLSET(&mask)) {
		if (othercpus < 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}
	/* See smp_tlb_shootdown() for the deadlock rationale. */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_invpcid.addr = addr1;
	if (pmap == NULL) {
		smp_tlb_invpcid.pcid = 0;
	} else {
		smp_tlb_invpcid.pcid = pmap->pm_pcid;
		pcid_cr3 = pmap->pm_cr3;
	}
	smp_tlb_addr2 = addr2;
	smp_tlb_pmap = pmap;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (CPU_ISFULLSET(&mask)) {
		ncpu = othercpus;
		ipi_all_but_self(vector);
	} else {
		ncpu = 0;
		while ((cpu = CPU_FFS(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
			    cpu, vector);
			ipi_send_cpu(cpu, vector);
			ncpu++;
		}
	}
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

/* Flush caches on all other CPUs (no-op until SMP is started). */
void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
}

/* Invalidate the whole TLB for 'pmap' on all other CPUs. */
void
smp_invltlb(pmap_t pmap)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

/* Invalidate a single page mapping on all other CPUs. */
void
smp_invlpg(pmap_t pmap, vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

/* Invalidate the page range [addr1, addr2) on all other CPUs. */
void
smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

/* Invalidate the whole TLB on the CPUs in 'mask'. */
void
smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{

	if (smp_started) {
		/*
		 * NOTE(review): the pmap argument is ignored and NULL is
		 * passed instead, unlike smp_masked_invlpg() and
		 * smp_masked_invlpg_range() which forward their pmap.
		 * NULL forces the non-PCID (pcid 0) invalidation path —
		 * confirm this is intended rather than an oversight.
		 */
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, NULL, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_global++;
#endif
	}
}

/* Invalidate a single page mapping on the CPUs in 'mask'. */
void
smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_masked_page++;
#endif
	}
}

/* Invalidate the page range [addr1, addr2) on the CPUs in 'mask'. */
void
1285smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, 1286 vm_offset_t addr2) 1287{ 1288 1289 if (smp_started) { 1290 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, 1291 addr2); 1292#ifdef COUNT_XINVLTLB_HITS 1293 ipi_masked_range++; 1294 ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; 1295#endif 1296 } 1297} 1298 1299void 1300ipi_bitmap_handler(struct trapframe frame) 1301{ 1302 struct trapframe *oldframe; 1303 struct thread *td; 1304 int cpu = PCPU_GET(cpuid); 1305 u_int ipi_bitmap; 1306 1307 critical_enter(); 1308 td = curthread; 1309 td->td_intr_nesting_level++; 1310 oldframe = td->td_intr_frame; 1311 td->td_intr_frame = &frame; 1312 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1313 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1314#ifdef COUNT_IPIS 1315 (*ipi_preempt_counts[cpu])++; 1316#endif 1317 sched_preempt(td); 1318 } 1319 if (ipi_bitmap & (1 << IPI_AST)) { 1320#ifdef COUNT_IPIS 1321 (*ipi_ast_counts[cpu])++; 1322#endif 1323 /* Nothing to do for AST */ 1324 } 1325 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1326#ifdef COUNT_IPIS 1327 (*ipi_hardclock_counts[cpu])++; 1328#endif 1329 hardclockintr(); 1330 } 1331 td->td_intr_frame = oldframe; 1332 td->td_intr_nesting_level--; 1333 critical_exit(); 1334} 1335 1336/* 1337 * send an IPI to a set of cpus. 1338 */ 1339void 1340ipi_selected(cpuset_t cpus, u_int ipi) 1341{ 1342 int cpu; 1343 1344 /* 1345 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1346 * of help in order to understand what is the source. 1347 * Set the mask of receiving CPUs for this purpose. 1348 */ 1349 if (ipi == IPI_STOP_HARD) 1350 CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); 1351 1352 while ((cpu = CPU_FFS(&cpus)) != 0) { 1353 cpu--; 1354 CPU_CLR(cpu, &cpus); 1355 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1356 ipi_send_cpu(cpu, ipi); 1357 } 1358} 1359 1360/* 1361 * send an IPI to a specific CPU. 
1362 */ 1363void 1364ipi_cpu(int cpu, u_int ipi) 1365{ 1366 1367 /* 1368 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1369 * of help in order to understand what is the source. 1370 * Set the mask of receiving CPUs for this purpose. 1371 */ 1372 if (ipi == IPI_STOP_HARD) 1373 CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); 1374 1375 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1376 ipi_send_cpu(cpu, ipi); 1377} 1378 1379/* 1380 * send an IPI to all CPUs EXCEPT myself 1381 */ 1382void 1383ipi_all_but_self(u_int ipi) 1384{ 1385 cpuset_t other_cpus; 1386 1387 other_cpus = all_cpus; 1388 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1389 1390 if (IPI_IS_BITMAPED(ipi)) { 1391 ipi_selected(other_cpus, ipi); 1392 return; 1393 } 1394 1395 /* 1396 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1397 * of help in order to understand what is the source. 1398 * Set the mask of receiving CPUs for this purpose. 1399 */ 1400 if (ipi == IPI_STOP_HARD) 1401 CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); 1402 1403 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1404 cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1405} 1406 1407int 1408ipi_nmi_handler() 1409{ 1410 u_int cpuid; 1411 1412 /* 1413 * As long as there is not a simple way to know about a NMI's 1414 * source, if the bitmask for the current CPU is present in 1415 * the global pending bitword an IPI_STOP_HARD has been issued 1416 * and should be handled. 1417 */ 1418 cpuid = PCPU_GET(cpuid); 1419 if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) 1420 return (1); 1421 1422 CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); 1423 cpustop_handler(); 1424 return (0); 1425} 1426 1427/* 1428 * Handle an IPI_STOP by saving our current context and spinning until we 1429 * are resumed. 
1430 */ 1431void 1432cpustop_handler(void) 1433{ 1434 u_int cpu; 1435 1436 cpu = PCPU_GET(cpuid); 1437 1438 savectx(&stoppcbs[cpu]); 1439 1440 /* Indicate that we are stopped */ 1441 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1442 1443 /* Wait for restart */ 1444 while (!CPU_ISSET(cpu, &started_cpus)) 1445 ia32_pause(); 1446 1447 CPU_CLR_ATOMIC(cpu, &started_cpus); 1448 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1449 1450#ifdef DDB 1451 amd64_db_resume_dbreg(); 1452#endif 1453 1454 if (cpu == 0 && cpustop_restartfunc != NULL) { 1455 cpustop_restartfunc(); 1456 cpustop_restartfunc = NULL; 1457 } 1458} 1459 1460/* 1461 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1462 * are resumed. 1463 */ 1464void 1465cpususpend_handler(void) 1466{ 1467 u_int cpu; 1468 1469 cpu = PCPU_GET(cpuid); 1470 1471#ifdef XENHVM 1472 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1473#endif 1474 1475 if (savectx(susppcbs[cpu])) { 1476 ctx_fpusave(susppcbs[cpu]->pcb_fpususpend); 1477 wbinvd(); 1478 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1479 } else { 1480 pmap_init_pat(); 1481 initializecpu(); 1482 PCPU_SET(switchtime, 0); 1483 PCPU_SET(switchticks, ticks); 1484 1485 /* Indicate that we are resumed */ 1486 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1487 } 1488 1489 /* Wait for resume */ 1490 while (!CPU_ISSET(cpu, &started_cpus)) 1491 ia32_pause(); 1492 1493#ifdef XENHVM 1494 /* 1495 * Reset pending bitmap IPIs, because Xen doesn't preserve pending 1496 * event channels on migration. 1497 */ 1498 cpu_ipi_pending[cpu] = 0; 1499 /* register vcpu_info area */ 1500 xen_hvm_init_cpu(); 1501#endif 1502 1503 /* Resume MCA and local APIC */ 1504 mca_resume(); 1505 lapic_setup(0); 1506 1507 CPU_CLR_ATOMIC(cpu, &started_cpus); 1508 /* Indicate that we are resumed */ 1509 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1510} 1511 1512/* 1513 * This is called once the rest of the system is up and running and we're 1514 * ready to let the AP's out of the pen. 
1515 */ 1516static void 1517release_aps(void *dummy __unused) 1518{ 1519 1520 if (mp_ncpus == 1) 1521 return; 1522 atomic_store_rel_int(&aps_ready, 1); 1523 while (smp_started == 0) 1524 ia32_pause(); 1525} 1526SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1527 1528#ifdef COUNT_IPIS 1529/* 1530 * Setup interrupt counters for IPI handlers. 1531 */ 1532static void 1533mp_ipi_intrcnt(void *dummy) 1534{ 1535 char buf[64]; 1536 int i; 1537 1538 CPU_FOREACH(i) { 1539 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1540 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1541 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1542 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1543 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1544 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1545 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1546 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1547 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1548 intrcnt_add(buf, &ipi_preempt_counts[i]); 1549 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1550 intrcnt_add(buf, &ipi_ast_counts[i]); 1551 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1552 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1553 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1554 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1555 } 1556} 1557SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1558#endif 1559 1560