mp_x86.c revision 318782
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 318782 2017-05-24 11:09:06Z avg $");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/cpu.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application Processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int apic_cpuids[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
    &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package\n", level);
		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core\n", level);
		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}
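/*
 * Note on the *_id_shift values computed by the probe routines below:
 * each shift is the number of low-order APIC ID bits that vary within
 * the corresponding component, and topo_probe() later derives a
 * hardware ID for every topology level simply by shifting an APIC ID
 * right by that amount.  Illustrative example (hypothetical values):
 * with pkg_id_shift = 4 and core_id_shift = 1, APIC ID 0x0b (binary
 * 1011) belongs to package 1011b >> 4 = 0 and core 1011b >> 1 = 5, and
 * is the second SMT thread of that core; its sibling is APIC ID 0x0a.
 */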
/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 *  - AMD CPUID Specification (Publication # 25481)
 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 *  - BKDG For AMD Family 10h Processors (Publication # 31116)
 *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual
 *    Volume 2A: Instruction Set Reference, A-M,
 *    CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}
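/*
 * Example of the CPUID leaf 0xb enumeration handled above (values are
 * hypothetical): on a part with two SMT threads per core, the SMT
 * sub-leaf reports a shift of 1, so core_id_shift becomes 1, and the
 * Core sub-leaf might report a shift of, say, 4, so pkg_id_shift
 * becomes 4 even when fewer than 16 logical CPUs are populated; the
 * shifts describe APIC ID spacing, not the actual CPU count.
 */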
/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 3];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

	topo_init_root(&topo_root);
	for (i = 0; i <= MAX_APIC_ID; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
			node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}
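/*
 * assign_cpu_ids() below marks a processing unit as a hyperthread when
 * the low core_id_shift bits of its APIC ID differ from those of the
 * BSP, i.e. when it occupies a different SMT position within its core
 * than the boot processor does.  For example (hypothetical values):
 * with core_id_shift = 1, smt_mask is 0x1 and a BSP with APIC ID 0
 * causes every odd-numbered APIC ID to be tagged as a hyperthread and
 * therefore skipped when hyperthreading_allowed is zero.
 */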
/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	int pkg_count;
	int cores_per_pkg;
	int thrs_per_core;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &pkg_count,
	    &cores_per_pkg, &thrs_per_core)) {
		printf("%d package(s)", pkg_count);
		if (cores_per_pkg > 0)
			printf(" x %d core(s)", cores_per_pkg);
		if (thrs_per_core > 1)
			printf(" x %d hardware threads", thrs_per_core);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &pkg_count,
		    &cores_per_pkg, &thrs_per_core)) {
			printf("%d package(s)", pkg_count);
			if (cores_per_pkg > 0)
				printf(" x %d core(s)", cores_per_pkg);
			if (thrs_per_core > 1)
				printf(" x %d hardware threads", thrs_per_core);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
				    "(disabled)\n", hyperthread, node->hwid,
				    node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
				    node->hwid, node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
				    node->id, hyperthread, node->hwid,
				    node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}
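/*
 * The functions below translate the hardware topology tree into the
 * scheduler's cpu_group hierarchy.  On a hypothetical two-package
 * system with one shared L3 per package and two-way SMT cores, the
 * result is a root group (CG_SHARE_NONE) with one child per L3 cache,
 * each of which in turn has one child per core-private cache; a cache
 * group that contains more than one logical CPU but only one core is
 * flagged CG_FLAG_SMT.  Cache nodes that cover exactly the same CPUs
 * as their parent are collapsed rather than adding a redundant level.
 */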
/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_SYSTEM)
		cg_root->cg_level = CG_SHARE_NONE;
	else
		cg_root->cg_level = root->subtype;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags = CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}

/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ?
"BSP" : 839 "AP"); 840} 841 842void 843cpu_mp_setmaxid(void) 844{ 845 846 /* 847 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 848 * If there were no calls to cpu_add() assume this is a UP system. 849 */ 850 if (mp_ncpus == 0) 851 mp_ncpus = 1; 852} 853 854int 855cpu_mp_probe(void) 856{ 857 858 /* 859 * Always record BSP in CPU map so that the mbuf init code works 860 * correctly. 861 */ 862 CPU_SETOF(0, &all_cpus); 863 return (mp_ncpus > 1); 864} 865 866/* 867 * AP CPU's call this to initialize themselves. 868 */ 869void 870init_secondary_tail(void) 871{ 872 u_int cpuid; 873 874 /* 875 * On real hardware, switch to x2apic mode if possible. Do it 876 * after aps_ready was signalled, to avoid manipulating the 877 * mode while BSP might still want to send some IPI to us 878 * (second startup IPI is ignored on modern hardware etc). 879 */ 880 lapic_xapic_mode(); 881 882 /* Initialize the PAT MSR. */ 883 pmap_init_pat(); 884 885 /* set up CPU registers and state */ 886 cpu_setregs(); 887 888 /* set up SSE/NX */ 889 initializecpu(); 890 891 /* set up FPU state on the AP */ 892#ifdef __amd64__ 893 fpuinit(); 894#else 895 npxinit(false); 896#endif 897 898 if (cpu_ops.cpu_init) 899 cpu_ops.cpu_init(); 900 901 /* A quick check from sanity claus */ 902 cpuid = PCPU_GET(cpuid); 903 if (PCPU_GET(apic_id) != lapic_id()) { 904 printf("SMP: cpuid = %d\n", cpuid); 905 printf("SMP: actual apic_id = %d\n", lapic_id()); 906 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 907 panic("cpuid mismatch! boom!!"); 908 } 909 910 /* Initialize curthread. */ 911 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 912 PCPU_SET(curthread, PCPU_GET(idlethread)); 913 914 mca_init(); 915 916 mtx_lock_spin(&ap_boot_mtx); 917 918 /* Init local apic for irq's */ 919 lapic_setup(1); 920 921 /* Set memory range attributes for this CPU to match the BSP */ 922 mem_range_AP_init(); 923 924 smp_cpus++; 925 926 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 927 printf("SMP: AP CPU #%d Launched!\n", cpuid); 928 929 /* Determine if we are a logical CPU. */ 930 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 931 CPU_SET(cpuid, &logical_cpus_mask); 932 933 if (bootverbose) 934 lapic_dump("AP"); 935 936 if (smp_cpus == mp_ncpus) { 937 /* enable IPI's, tlb shootdown, freezes etc */ 938 atomic_store_rel_int(&smp_started, 1); 939 } 940 941#ifdef __amd64__ 942 /* 943 * Enable global pages TLB extension 944 * This also implicitly flushes the TLB 945 */ 946 load_cr4(rcr4() | CR4_PGE); 947 if (pmap_pcid_enabled) 948 load_cr4(rcr4() | CR4_PCIDE); 949 load_ds(_udatasel); 950 load_es(_udatasel); 951 load_fs(_ufssel); 952#endif 953 954 mtx_unlock_spin(&ap_boot_mtx); 955 956 /* Wait until all the AP's are up. */ 957 while (atomic_load_acq_int(&smp_started) == 0) 958 ia32_pause(); 959 960#ifndef EARLY_AP_STARTUP 961 /* Start per-CPU event timers. */ 962 cpu_initclocks_ap(); 963#endif 964 965 sched_throw(NULL); 966 967 panic("scheduler returned us to %s", __func__); 968 /* NOTREACHED */ 969} 970 971/******************************************************************* 972 * local functions and data 973 */ 974 975/* 976 * We tell the I/O APIC code about all the CPUs we want to receive 977 * interrupts. If we don't want certain CPUs to receive IRQs we 978 * can simply not tell the I/O APIC code about them in this function. 979 * We also do not tell it about the BSP since it tells itself about 980 * the BSP internally to work with UP kernels and on UP machines. 
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10 ms */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue.  OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200 us */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200 us */
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(other_cpus, ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

#ifdef DEV_ISA
int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}
#endif

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		wbinvd();
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we are resumed */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &started_cpus);
}

void
invlcache_handler(void)
{
	uint32_t generation;

#ifdef COUNT_IPIS
	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since wbinvd is a serializing instruction.  Without the
	 * temporary, we'd wait for wbinvd to complete, then the read
	 * would execute, then the dependent write, which must then
	 * complete before return from interrupt.
	 */
	generation = smp_tlb_generation;
	wbinvd();
	PCPU_SET(smp_tlb_done, generation);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif

/*
 * Flush the TLB on other CPUs.
 */
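/*
 * The shootdown code below hands the target address range and pmap to
 * the remote CPUs through the smp_tlb_* variables, protected by
 * smp_ipi_mtx, and then waits for completion using a generation count:
 * the initiator bumps smp_tlb_generation, each handler copies the
 * current generation before invalidating and publishes it in its
 * per-CPU pc_smp_tlb_done field when finished, and the initiator spins
 * until every targeted CPU has published the new generation.
 */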
/* Variables needed for SMP tlb shootdown. */
static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap;
volatile uint32_t smp_tlb_generation;

#ifdef __amd64__
#define	read_eflags() read_rflags()
#endif

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
{
	cpuset_t other_cpus;
	volatile uint32_t *p_cpudone;
	uint32_t generation;
	int cpu;

	/*
	 * Check for other cpus.  Return if none.
	 */
	if (CPU_ISFULLSET(&mask)) {
		if (mp_ncpus <= 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}

	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	smp_tlb_pmap = pmap;
	generation = ++smp_tlb_generation;
	if (CPU_ISFULLSET(&mask)) {
		ipi_all_but_self(vector);
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	} else {
		other_cpus = mask;
		while ((cpu = CPU_FFS(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
			    cpu, vector);
			ipi_send_cpu(cpu, vector);
		}
	}
	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &other_cpus);
		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
		while (*p_cpudone != generation)
			ia32_pause();
	}
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
		    addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
smp_cache_flush(void)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
		    0, 0);
	}
}

/*
 * Handlers for TLB related IPIs
 */
void
invltlb_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since invalidating the TLB is a serializing operation.
	 */
	generation = smp_tlb_generation;
	if (smp_tlb_pmap == kernel_pmap)
		invltlb_glob();
	else
		invltlb();
	PCPU_SET(smp_tlb_done, generation);
}

void
invlpg_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_pg[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	generation = smp_tlb_generation;	/* Overlap with serialization */
	invlpg(smp_tlb_addr1);
	PCPU_SET(smp_tlb_done, generation);
}

void
invlrng_handler(void)
{
	vm_offset_t addr, addr2;
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_rng[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	addr = smp_tlb_addr1;
	addr2 = smp_tlb_addr2;
	generation = smp_tlb_generation;	/* Overlap with serialization */
	do {
		invlpg(addr);
		addr += PAGE_SIZE;
	} while (addr < addr2);

	PCPU_SET(smp_tlb_done, generation);
}