/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 349958 2019-07-12 22:31:12Z jhb $");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <x86/ucode.h>

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application Processors (APs) */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int apic_cpuids[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap-based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package\n", level);
		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core\n", level);
		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}
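
/*
 * For illustration: mask_width(8) == 3, and mask_width(6) == 3 as well since
 * 6 is first rounded up to 8, so a cache reported as shared by up to 6 or 8
 * APIC IDs gets id_shift = 3 (APIC IDs that agree in all bits above bit 2
 * share it).  mask_width(1) == 0 and mask_width(0) == -1.
 */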

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 * - AMD CPUID Specification (Publication # 25481)
 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 * - BKDG For AMD Family 10h Processors (Publication # 31116)
 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 2A: Instruction Set Reference, A-M,
 *   CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of
		 * hardcoded values, so for now just assume L1 and L2 caches
		 * potentially shared only by HTT processing units, if HTT
		 * is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}
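
/*
 * Illustration with hypothetical shift values: if core_id_shift = 1 and
 * pkg_id_shift = 3, then APIC IDs differing only in bit 0 are hardware
 * threads of one core, and APIC IDs agreeing in all bits above bit 2 belong
 * to one package.  topo_probe() below applies exactly this
 * "hwid >> id_shift" decomposition when building the topology tree, and
 * assign_cpu_ids() uses (1 << core_id_shift) - 1 as the SMT mask.
 */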

/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated to all packages using the
 * assumption that the APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 3];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

	topo_init_root(&topo_root);
	for (i = 0; i <= MAX_APIC_ID; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
			node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	int pkg_count;
	int cores_per_pkg;
	int thrs_per_core;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &pkg_count,
	    &cores_per_pkg, &thrs_per_core)) {
		printf("%d package(s)", pkg_count);
		if (cores_per_pkg > 0)
			printf(" x %d core(s)", cores_per_pkg);
		if (thrs_per_core > 1)
			printf(" x %d hardware threads", thrs_per_core);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &pkg_count,
		    &cores_per_pkg, &thrs_per_core)) {
			printf("%d package(s)", pkg_count);
			if (cores_per_pkg > 0)
				printf(" x %d core(s)", cores_per_pkg);
			if (thrs_per_core > 1)
				printf(" x %d hardware threads", thrs_per_core);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
				    "(disabled)\n", hyperthread, node->hwid,
				    node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
				    node->hwid, node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
				    node->id, hyperthread, node->hwid,
				    node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}
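
/*
 * Illustrative cpu_mp_announce() output for a hypothetical machine with two
 * packages, four cores per package and two hardware threads per core:
 *
 *	FreeBSD/SMP: 2 package(s) x 4 core(s) x 2 hardware threads
 */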

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_SYSTEM)
		cg_root->cg_level = CG_SHARE_NONE;
	else
		cg_root->cg_level = root->subtype;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags = CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}

/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}
"BSP" : 835 "AP"); 836} 837 838void 839cpu_mp_setmaxid(void) 840{ 841 842 /* 843 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 844 * If there were no calls to cpu_add() assume this is a UP system. 845 */ 846 if (mp_ncpus == 0) 847 mp_ncpus = 1; 848} 849 850int 851cpu_mp_probe(void) 852{ 853 854 /* 855 * Always record BSP in CPU map so that the mbuf init code works 856 * correctly. 857 */ 858 CPU_SETOF(0, &all_cpus); 859 return (mp_ncpus > 1); 860} 861 862/* 863 * AP CPU's call this to initialize themselves. 864 */ 865void 866init_secondary_tail(void) 867{ 868 u_int cpuid; 869 870 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); 871 872 /* 873 * On real hardware, switch to x2apic mode if possible. Do it 874 * after aps_ready was signalled, to avoid manipulating the 875 * mode while BSP might still want to send some IPI to us 876 * (second startup IPI is ignored on modern hardware etc). 877 */ 878 lapic_xapic_mode(); 879 880 /* Initialize the PAT MSR. */ 881 pmap_init_pat(); 882 883 /* set up CPU registers and state */ 884 cpu_setregs(); 885 886 /* set up SSE/NX */ 887 initializecpu(); 888 889 /* set up FPU state on the AP */ 890#ifdef __amd64__ 891 fpuinit(); 892#else 893 npxinit(false); 894#endif 895 896 if (cpu_ops.cpu_init) 897 cpu_ops.cpu_init(); 898 899 /* A quick check from sanity claus */ 900 cpuid = PCPU_GET(cpuid); 901 if (PCPU_GET(apic_id) != lapic_id()) { 902 printf("SMP: cpuid = %d\n", cpuid); 903 printf("SMP: actual apic_id = %d\n", lapic_id()); 904 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 905 panic("cpuid mismatch! boom!!"); 906 } 907 908 /* Initialize curthread. */ 909 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 910 PCPU_SET(curthread, PCPU_GET(idlethread)); 911 912 mtx_lock_spin(&ap_boot_mtx); 913 914 mca_init(); 915 916 /* Init local apic for irq's */ 917 lapic_setup(1); 918 919 /* Set memory range attributes for this CPU to match the BSP */ 920 mem_range_AP_init(); 921 922 smp_cpus++; 923 924 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 925 printf("SMP: AP CPU #%d Launched!\n", cpuid); 926 927 /* Determine if we are a logical CPU. */ 928 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 929 CPU_SET(cpuid, &logical_cpus_mask); 930 931 if (bootverbose) 932 lapic_dump("AP"); 933 934 if (smp_cpus == mp_ncpus) { 935 /* enable IPI's, tlb shootdown, freezes etc */ 936 atomic_store_rel_int(&smp_started, 1); 937 } 938 939#ifdef __amd64__ 940 /* 941 * Enable global pages TLB extension 942 * This also implicitly flushes the TLB 943 */ 944 load_cr4(rcr4() | CR4_PGE); 945 if (pmap_pcid_enabled) 946 load_cr4(rcr4() | CR4_PCIDE); 947 load_ds(_udatasel); 948 load_es(_udatasel); 949 load_fs(_ufssel); 950#endif 951 952 mtx_unlock_spin(&ap_boot_mtx); 953 954 /* Wait until all the AP's are up. */ 955 while (atomic_load_acq_int(&smp_started) == 0) 956 ia32_pause(); 957 958#ifndef EARLY_AP_STARTUP 959 /* Start per-CPU event timers. */ 960 cpu_initclocks_ap(); 961#endif 962 963 sched_throw(NULL); 964 965 panic("scheduler returned us to %s", __func__); 966 /* NOTREACHED */ 967} 968 969/******************************************************************* 970 * local functions and data 971 */ 972 973/* 974 * We tell the I/O APIC code about all the CPUs we want to receive 975 * interrupts. If we don't want certain CPUs to receive IRQs we 976 * can simply not tell the I/O APIC code about them in this function. 
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10 ms */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue.  OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200 us */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200 us */
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(other_cpus, ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bit for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}

void
invlcache_handler(void)
{
	uint32_t generation;

#ifdef COUNT_IPIS
	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since wbinvd is a serializing instruction.  Without the
	 * temporary, we'd wait for wbinvd to complete, then the read
	 * would execute, then the dependent write, which must then
	 * complete before return from interrupt.
	 */
	generation = smp_tlb_generation;
	wbinvd();
	PCPU_SET(smp_tlb_done, generation);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif

/*
 * Flush the TLB on other CPUs.
 */

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap;
volatile uint32_t smp_tlb_generation;

#ifdef __amd64__
#define	read_eflags() read_rflags()
#endif

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
{
	cpuset_t other_cpus;
	volatile uint32_t *p_cpudone;
	uint32_t generation;
	int cpu;

	/*
	 * Check for other cpus.  Return if none.
	 */
	if (CPU_ISFULLSET(&mask)) {
		if (mp_ncpus <= 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}

	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	smp_tlb_pmap = pmap;
	generation = ++smp_tlb_generation;
	if (CPU_ISFULLSET(&mask)) {
		ipi_all_but_self(vector);
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	} else {
		other_cpus = mask;
		while ((cpu = CPU_FFS(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
			    cpu, vector);
			ipi_send_cpu(cpu, vector);
		}
	}
	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &other_cpus);
		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
		while (*p_cpudone != generation)
			ia32_pause();
	}
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
    pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
		    addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
smp_cache_flush(void)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
		    0, 0);
	}
}
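
/*
 * Completion protocol shared by smp_targeted_tlb_shootdown() above and the
 * handlers below: the initiator publishes its arguments, bumps
 * smp_tlb_generation while holding smp_ipi_mtx, sends the IPIs, then spins
 * until each target's per-CPU pc_smp_tlb_done counter reaches that
 * generation.  Every handler reads the generation before invalidating and
 * only afterwards stores it into its own smp_tlb_done.
 */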

/*
 * Handlers for TLB related IPIs
 */
void
invltlb_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since invalidating the TLB is a serializing operation.
	 */
	generation = smp_tlb_generation;
	if (smp_tlb_pmap == kernel_pmap)
		invltlb_glob();
	else
		invltlb();
	PCPU_SET(smp_tlb_done, generation);
}

void
invlpg_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_pg[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	generation = smp_tlb_generation;	/* Overlap with serialization */
	invlpg(smp_tlb_addr1);
	PCPU_SET(smp_tlb_done, generation);
}

void
invlrng_handler(void)
{
	vm_offset_t addr, addr2;
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_rng[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	addr = smp_tlb_addr1;
	addr2 = smp_tlb_addr2;
	generation = smp_tlb_generation;	/* Overlap with serialization */
	do {
		invlpg(addr);
		addr += PAGE_SIZE;
	} while (addr < addr2);

	PCPU_SET(smp_tlb_done, generation);
}