/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/x86/x86/mp_x86.c 291688 2015-12-03 11:14:14Z kib $");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/cpu.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;
#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap-based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

int cpu_logical;		/* logical cpus per core */
int cpu_cores;			/* cores per package */

static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;	/* logical cpus sharing L1 cache */
static int	hyperthreading_allowed = 1;

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

static void
topo_probe_amd(void)
{
	int core_id_bits;
	int id;

	/* AMD processors do not support HTT. */
	cpu_logical = 1;

	if ((amd_feature2 & AMDID2_CMP) == 0) {
		cpu_cores = 1;
		return;
	}

	core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;
	if (core_id_bits == 0) {
		cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
		return;
	}

	/* Fam 10h and newer should get here. */
	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
			continue;
		cpu_cores++;
	}
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}
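
/*
 * Worked examples of the above: powerof2(4) is 1, so mask_width(4) =
 * fls(4 << 0) - 1 = 3 - 1 = 2.  powerof2(6) is 0, so mask_width(6) =
 * fls(6 << 1) - 1 = fls(12) - 1 = 4 - 1 = 3, i.e. 6 is first rounded up
 * to 8 and then log2 is taken.  mask_width(0) = fls(0) - 1 = -1.
 */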
static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of the uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as the BSP.  Further, we count the number of
	 * logical processors that belong to the same core as the
	 * BSP, thus deducing the number of threads per core.
	 */
	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of the uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as the BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_cores /= cpu_logical;
}

/*
 * Both the topology discovery code and the code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages using the
 * uniformity assumption.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See the Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that the 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in a separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}
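
/*
 * For illustration (hypothetical values, not from this file): on a
 * two-package system with 4 cores per package and 2 threads per core,
 * topo_probe() leaves cpu_cores = 4 and cpu_logical = 2.  Consumers then
 * extrapolate the package count as mp_ncpus / (cpu_cores * cpu_logical) =
 * 16 / 8 = 2, which is exactly the arithmetic cpu_mp_announce() performs.
 */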
360 */ 361 topo_probe(); 362 if (cpu_logical > 1 && hyperthreading_cpus) 363 cg_flags = CG_FLAG_HTT; 364 else if (cpu_logical > 1) 365 cg_flags = CG_FLAG_SMT; 366 else 367 cg_flags = 0; 368 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 369 printf("WARNING: Non-uniform processors.\n"); 370 printf("WARNING: Using suboptimal topology.\n"); 371 return (smp_topo_none()); 372 } 373 /* 374 * No multi-core or hyper-threaded. 375 */ 376 if (cpu_logical * cpu_cores == 1) 377 return (smp_topo_none()); 378 /* 379 * Only HTT no multi-core. 380 */ 381 if (cpu_logical > 1 && cpu_cores == 1) 382 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); 383 /* 384 * Only multi-core no HTT. 385 */ 386 if (cpu_cores > 1 && cpu_logical == 1) 387 return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); 388 /* 389 * Both HTT and multi-core. 390 */ 391 return (smp_topo_2level(CG_SHARE_L2, cpu_cores, 392 CG_SHARE_L1, cpu_logical, cg_flags)); 393} 394 395 396void 397cpu_add(u_int apic_id, char boot_cpu) 398{ 399 400 if (apic_id > MAX_APIC_ID) { 401 panic("SMP: APIC ID %d too high", apic_id); 402 return; 403 } 404 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 405 apic_id)); 406 cpu_info[apic_id].cpu_present = 1; 407 if (boot_cpu) { 408 KASSERT(boot_cpu_id == -1, 409 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 410 boot_cpu_id)); 411 boot_cpu_id = apic_id; 412 cpu_info[apic_id].cpu_bsp = 1; 413 } 414 if (mp_ncpus < MAXCPU) { 415 mp_ncpus++; 416 mp_maxid = mp_ncpus - 1; 417 } 418 if (bootverbose) 419 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 420 "AP"); 421} 422 423void 424cpu_mp_setmaxid(void) 425{ 426 427 /* 428 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 429 * If there were no calls to cpu_add() assume this is a UP system. 430 */ 431 if (mp_ncpus == 0) 432 mp_ncpus = 1; 433} 434 435int 436cpu_mp_probe(void) 437{ 438 439 /* 440 * Always record BSP in CPU map so that the mbuf init code works 441 * correctly. 442 */ 443 CPU_SETOF(0, &all_cpus); 444 return (mp_ncpus > 1); 445} 446 447/* 448 * Print various information about the SMP system hardware and setup. 449 */ 450void 451cpu_mp_announce(void) 452{ 453 const char *hyperthread; 454 int i; 455 456 printf("FreeBSD/SMP: %d package(s) x %d core(s)", 457 mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); 458 if (hyperthreading_cpus > 1) 459 printf(" x %d HTT threads", cpu_logical); 460 else if (cpu_logical > 1) 461 printf(" x %d SMT threads", cpu_logical); 462 printf("\n"); 463 464 /* List active CPUs first. */ 465 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 466 for (i = 1; i < mp_ncpus; i++) { 467 if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) 468 hyperthread = "/HT"; 469 else 470 hyperthread = ""; 471 printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, 472 cpu_apic_ids[i]); 473 } 474 475 /* List disabled CPUs last. */ 476 for (i = 0; i <= MAX_APIC_ID; i++) { 477 if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) 478 continue; 479 if (cpu_info[i].cpu_hyperthread) 480 hyperthread = "/HT"; 481 else 482 hyperthread = ""; 483 printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, 484 i); 485 } 486} 487 488void 489init_secondary_tail(void) 490{ 491 u_int cpuid; 492 493 /* 494 * On real hardware, switch to x2apic mode if possible. Do it 495 * after aps_ready was signalled, to avoid manipulating the 496 * mode while BSP might still want to send some IPI to us 497 * (second startup IPI is ignored on modern hardware etc). 
498 */ 499 lapic_xapic_mode(); 500 501 /* Initialize the PAT MSR. */ 502 pmap_init_pat(); 503 504 /* set up CPU registers and state */ 505 cpu_setregs(); 506 507 /* set up SSE/NX */ 508 initializecpu(); 509 510 /* set up FPU state on the AP */ 511#ifdef __amd64__ 512 fpuinit(); 513#else 514 npxinit(false); 515#endif 516 517 if (cpu_ops.cpu_init) 518 cpu_ops.cpu_init(); 519 520 /* A quick check from sanity claus */ 521 cpuid = PCPU_GET(cpuid); 522 if (PCPU_GET(apic_id) != lapic_id()) { 523 printf("SMP: cpuid = %d\n", cpuid); 524 printf("SMP: actual apic_id = %d\n", lapic_id()); 525 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 526 panic("cpuid mismatch! boom!!"); 527 } 528 529 /* Initialize curthread. */ 530 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 531 PCPU_SET(curthread, PCPU_GET(idlethread)); 532 533 mca_init(); 534 535 mtx_lock_spin(&ap_boot_mtx); 536 537 /* Init local apic for irq's */ 538 lapic_setup(1); 539 540 /* Set memory range attributes for this CPU to match the BSP */ 541 mem_range_AP_init(); 542 543 smp_cpus++; 544 545 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 546 printf("SMP: AP CPU #%d Launched!\n", cpuid); 547 548 /* Determine if we are a logical CPU. */ 549 /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ 550 if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) 551 CPU_SET(cpuid, &logical_cpus_mask); 552 553 if (bootverbose) 554 lapic_dump("AP"); 555 556 if (smp_cpus == mp_ncpus) { 557 /* enable IPI's, tlb shootdown, freezes etc */ 558 atomic_store_rel_int(&smp_started, 1); 559 } 560 561#ifdef __amd64__ 562 /* 563 * Enable global pages TLB extension 564 * This also implicitly flushes the TLB 565 */ 566 load_cr4(rcr4() | CR4_PGE); 567 if (pmap_pcid_enabled) 568 load_cr4(rcr4() | CR4_PCIDE); 569 load_ds(_udatasel); 570 load_es(_udatasel); 571 load_fs(_ufssel); 572#endif 573 574 mtx_unlock_spin(&ap_boot_mtx); 575 576 /* Wait until all the AP's are up. */ 577 while (atomic_load_acq_int(&smp_started) == 0) 578 ia32_pause(); 579 580 /* Start per-CPU event timers. */ 581 cpu_initclocks_ap(); 582 583 sched_throw(NULL); 584 585 panic("scheduler returned us to %s", __func__); 586 /* NOTREACHED */ 587} 588 589/******************************************************************* 590 * local functions and data 591 */ 592 593/* 594 * We tell the I/O APIC code about all the CPUs we want to receive 595 * interrupts. If we don't want certain CPUs to receive IRQs we 596 * can simply not tell the I/O APIC code about them in this function. 597 * We also do not tell it about the BSP since it tells itself about 598 * the BSP internally to work with UP kernels and on UP machines. 599 */ 600void 601set_interrupt_apic_ids(void) 602{ 603 u_int i, apic_id; 604 605 for (i = 0; i < MAXCPU; i++) { 606 apic_id = cpu_apic_ids[i]; 607 if (apic_id == -1) 608 continue; 609 if (cpu_info[apic_id].cpu_bsp) 610 continue; 611 if (cpu_info[apic_id].cpu_disabled) 612 continue; 613 614 /* Don't let hyperthreads service interrupts. */ 615 if (cpu_logical > 1 && 616 apic_id % cpu_logical != 0) 617 continue; 618 619 intr_add_cpu(i); 620 } 621} 622 623/* 624 * Assign logical CPU IDs to local APICs. 625 */ 626void 627assign_cpu_ids(void) 628{ 629 u_int i; 630 631 TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", 632 &hyperthreading_allowed); 633 634 /* Check for explicitly disabled CPUs. 
/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;

			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) {
		hyperthreading_cpus = 0;
		cpu_logical = 1;
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
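
/*
 * For illustration (hypothetical values): with boot_cpu_id = 4 and
 * MAX_APIC_ID = 7, the final loop above visits candidate APIC IDs in the
 * order 5, 6, 7, 0, 1, 2, 3, wrapping past MAX_APIC_ID so that AP
 * numbering starts at the BSP's APIC ID and stops once it comes back
 * around to it.
 */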
#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * First we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched
	 * (P5 bug), with the CPU waiting for a STARTUP IPI.  OR this INIT
	 * IPI might be ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10 ms */

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug); this first STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue.
	 * OR the previous INIT IPI has already run, and this STARTUP IPI
	 * will run.  OR the previous INIT IPI was ignored, and this
	 * STARTUP IPI will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200 us */

	/*
	 * Finally we do a second STARTUP IPI: this second STARTUP IPI
	 * should run IF the previous STARTUP IPI was cancelled by a latched
	 * INIT IPI.  OR this STARTUP IPI will be ignored, as only ONE
	 * STARTUP IPI is recognized after hardware RESET or an INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200 us */
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}
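
/*
 * Note on the bitmapped path in ipi_send_cpu() above: bitmapped IPIs
 * (IPI_PREEMPT, IPI_AST, IPI_HARDCLOCK) are coalesced per target CPU.
 * The compare-and-set loop ORs the request into cpu_ipi_pending[cpu],
 * and only the sender that transitions the word from zero actually
 * raises IPI_BITMAP_VECTOR; a non-zero old_pending means an interrupt is
 * already in flight and ipi_bitmap_handler() will consume the whole word
 * via atomic_readandclear_int().
 */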
868 */ 869void 870ipi_cpu(int cpu, u_int ipi) 871{ 872 873 /* 874 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 875 * of help in order to understand what is the source. 876 * Set the mask of receiving CPUs for this purpose. 877 */ 878 if (ipi == IPI_STOP_HARD) 879 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 880 881 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 882 ipi_send_cpu(cpu, ipi); 883} 884 885/* 886 * send an IPI to all CPUs EXCEPT myself 887 */ 888void 889ipi_all_but_self(u_int ipi) 890{ 891 cpuset_t other_cpus; 892 893 other_cpus = all_cpus; 894 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 895 if (IPI_IS_BITMAPED(ipi)) { 896 ipi_selected(other_cpus, ipi); 897 return; 898 } 899 900 /* 901 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 902 * of help in order to understand what is the source. 903 * Set the mask of receiving CPUs for this purpose. 904 */ 905 if (ipi == IPI_STOP_HARD) 906 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 907 908 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 909 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 910} 911 912int 913ipi_nmi_handler() 914{ 915 u_int cpuid; 916 917 /* 918 * As long as there is not a simple way to know about a NMI's 919 * source, if the bitmask for the current CPU is present in 920 * the global pending bitword an IPI_STOP_HARD has been issued 921 * and should be handled. 922 */ 923 cpuid = PCPU_GET(cpuid); 924 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 925 return (1); 926 927 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 928 cpustop_handler(); 929 return (0); 930} 931 932/* 933 * Handle an IPI_STOP by saving our current context and spinning until we 934 * are resumed. 935 */ 936void 937cpustop_handler(void) 938{ 939 u_int cpu; 940 941 cpu = PCPU_GET(cpuid); 942 943 savectx(&stoppcbs[cpu]); 944 945 /* Indicate that we are stopped */ 946 CPU_SET_ATOMIC(cpu, &stopped_cpus); 947 948 /* Wait for restart */ 949 while (!CPU_ISSET(cpu, &started_cpus)) 950 ia32_pause(); 951 952 CPU_CLR_ATOMIC(cpu, &started_cpus); 953 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 954 955#if defined(__amd64__) && defined(DDB) 956 amd64_db_resume_dbreg(); 957#endif 958 959 if (cpu == 0 && cpustop_restartfunc != NULL) { 960 cpustop_restartfunc(); 961 cpustop_restartfunc = NULL; 962 } 963} 964 965/* 966 * Handle an IPI_SUSPEND by saving our current context and spinning until we 967 * are resumed. 
968 */ 969void 970cpususpend_handler(void) 971{ 972 u_int cpu; 973 974 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 975 976 cpu = PCPU_GET(cpuid); 977 if (savectx(&susppcbs[cpu]->sp_pcb)) { 978#ifdef __amd64__ 979 fpususpend(susppcbs[cpu]->sp_fpususpend); 980#else 981 npxsuspend(susppcbs[cpu]->sp_fpususpend); 982#endif 983 wbinvd(); 984 CPU_SET_ATOMIC(cpu, &suspended_cpus); 985 } else { 986#ifdef __amd64__ 987 fpuresume(susppcbs[cpu]->sp_fpususpend); 988#else 989 npxresume(susppcbs[cpu]->sp_fpususpend); 990#endif 991 pmap_init_pat(); 992 initializecpu(); 993 PCPU_SET(switchtime, 0); 994 PCPU_SET(switchticks, ticks); 995 996 /* Indicate that we are resumed */ 997 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 998 } 999 1000 /* Wait for resume */ 1001 while (!CPU_ISSET(cpu, &started_cpus)) 1002 ia32_pause(); 1003 1004 if (cpu_ops.cpu_resume) 1005 cpu_ops.cpu_resume(); 1006#ifdef __amd64__ 1007 if (vmm_resume_p) 1008 vmm_resume_p(); 1009#endif 1010 1011 /* Resume MCA and local APIC */ 1012 lapic_xapic_mode(); 1013 mca_resume(); 1014 lapic_setup(0); 1015 1016 /* Indicate that we are resumed */ 1017 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1018 CPU_CLR_ATOMIC(cpu, &started_cpus); 1019} 1020 1021 1022void 1023invlcache_handler(void) 1024{ 1025#ifdef COUNT_IPIS 1026 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; 1027#endif /* COUNT_IPIS */ 1028 1029 wbinvd(); 1030 atomic_add_int(&smp_tlb_wait, 1); 1031} 1032 1033/* 1034 * This is called once the rest of the system is up and running and we're 1035 * ready to let the AP's out of the pen. 1036 */ 1037static void 1038release_aps(void *dummy __unused) 1039{ 1040 1041 if (mp_ncpus == 1) 1042 return; 1043 atomic_store_rel_int(&aps_ready, 1); 1044 while (smp_started == 0) 1045 ia32_pause(); 1046} 1047SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1048 1049#ifdef COUNT_IPIS 1050/* 1051 * Setup interrupt counters for IPI handlers. 1052 */ 1053static void 1054mp_ipi_intrcnt(void *dummy) 1055{ 1056 char buf[64]; 1057 int i; 1058 1059 CPU_FOREACH(i) { 1060 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1061 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1062 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1063 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1064 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1065 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1066 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1067 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1068 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1069 intrcnt_add(buf, &ipi_preempt_counts[i]); 1070 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1071 intrcnt_add(buf, &ipi_ast_counts[i]); 1072 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1073 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1074 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1075 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1076 } 1077} 1078SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1079#endif 1080 1081/* 1082 * Flush the TLB on other CPU's 1083 */ 1084 1085/* Variables needed for SMP tlb shootdown. */ 1086static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; 1087pmap_t smp_tlb_pmap; 1088volatile int smp_tlb_wait; 1089 1090#ifdef __amd64__ 1091#define read_eflags() read_rflags() 1092#endif 1093 1094static void 1095smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, 1096 vm_offset_t addr1, vm_offset_t addr2) 1097{ 1098 int cpu, ncpu, othercpus; 1099 1100 othercpus = mp_ncpus - 1; /* does not shootdown self */ 1101 1102 /* 1103 * Check for other cpus. Return if none. 
1104 */ 1105 if (CPU_ISFULLSET(&mask)) { 1106 if (othercpus < 1) 1107 return; 1108 } else { 1109 CPU_CLR(PCPU_GET(cpuid), &mask); 1110 if (CPU_EMPTY(&mask)) 1111 return; 1112 } 1113 1114 if (!(read_eflags() & PSL_I)) 1115 panic("%s: interrupts disabled", __func__); 1116 mtx_lock_spin(&smp_ipi_mtx); 1117 smp_tlb_addr1 = addr1; 1118 smp_tlb_addr2 = addr2; 1119 smp_tlb_pmap = pmap; 1120 smp_tlb_wait = 0; 1121 if (CPU_ISFULLSET(&mask)) { 1122 ncpu = othercpus; 1123 ipi_all_but_self(vector); 1124 } else { 1125 ncpu = 0; 1126 while ((cpu = CPU_FFS(&mask)) != 0) { 1127 cpu--; 1128 CPU_CLR(cpu, &mask); 1129 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1130 cpu, vector); 1131 ipi_send_cpu(cpu, vector); 1132 ncpu++; 1133 } 1134 } 1135 while (smp_tlb_wait < ncpu) 1136 ia32_pause(); 1137 mtx_unlock_spin(&smp_ipi_mtx); 1138} 1139 1140void 1141smp_masked_invltlb(cpuset_t mask, pmap_t pmap) 1142{ 1143 1144 if (smp_started) { 1145 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); 1146#ifdef COUNT_XINVLTLB_HITS 1147 ipi_global++; 1148#endif 1149 } 1150} 1151 1152void 1153smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) 1154{ 1155 1156 if (smp_started) { 1157 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0); 1158#ifdef COUNT_XINVLTLB_HITS 1159 ipi_page++; 1160#endif 1161 } 1162} 1163 1164void 1165smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) 1166{ 1167 1168 if (smp_started) { 1169 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL, 1170 addr1, addr2); 1171#ifdef COUNT_XINVLTLB_HITS 1172 ipi_range++; 1173 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1174#endif 1175 } 1176} 1177 1178void 1179smp_cache_flush(void) 1180{ 1181 1182 if (smp_started) { 1183 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 1184 0, 0); 1185 } 1186} 1187 1188/* 1189 * Handlers for TLB related IPIs 1190 */ 1191void 1192invltlb_handler(void) 1193{ 1194#ifdef COUNT_XINVLTLB_HITS 1195 xhits_gbl[PCPU_GET(cpuid)]++; 1196#endif /* COUNT_XINVLTLB_HITS */ 1197#ifdef COUNT_IPIS 1198 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1199#endif /* COUNT_IPIS */ 1200 1201 if (smp_tlb_pmap == kernel_pmap) 1202 invltlb_glob(); 1203 else 1204 invltlb(); 1205 atomic_add_int(&smp_tlb_wait, 1); 1206} 1207 1208void 1209invlpg_handler(void) 1210{ 1211#ifdef COUNT_XINVLTLB_HITS 1212 xhits_pg[PCPU_GET(cpuid)]++; 1213#endif /* COUNT_XINVLTLB_HITS */ 1214#ifdef COUNT_IPIS 1215 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1216#endif /* COUNT_IPIS */ 1217 1218 invlpg(smp_tlb_addr1); 1219 atomic_add_int(&smp_tlb_wait, 1); 1220} 1221 1222void 1223invlrng_handler(void) 1224{ 1225 vm_offset_t addr; 1226 1227#ifdef COUNT_XINVLTLB_HITS 1228 xhits_rng[PCPU_GET(cpuid)]++; 1229#endif /* COUNT_XINVLTLB_HITS */ 1230#ifdef COUNT_IPIS 1231 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; 1232#endif /* COUNT_IPIS */ 1233 1234 addr = smp_tlb_addr1; 1235 do { 1236 invlpg(addr); 1237 addr += PAGE_SIZE; 1238 } while (addr < smp_tlb_addr2); 1239 1240 atomic_add_int(&smp_tlb_wait, 1); 1241} 1242