mp_x86.c revision 337118
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 337118 2018-08-02 09:00:09Z avg $"); 29 30#ifdef __i386__ 31#include "opt_apic.h" 32#endif 33#include "opt_cpu.h" 34#include "opt_kstack_pages.h" 35#include "opt_pmap.h" 36#include "opt_sched.h" 37#include "opt_smp.h" 38 39#include <sys/param.h> 40#include <sys/systm.h> 41#include <sys/bus.h> 42#include <sys/cons.h> /* cngetc() */ 43#include <sys/cpuset.h> 44#ifdef GPROF 45#include <sys/gmon.h> 46#endif 47#include <sys/kernel.h> 48#include <sys/ktr.h> 49#include <sys/lock.h> 50#include <sys/malloc.h> 51#include <sys/memrange.h> 52#include <sys/mutex.h> 53#include <sys/pcpu.h> 54#include <sys/proc.h> 55#include <sys/sched.h> 56#include <sys/smp.h> 57#include <sys/sysctl.h> 58 59#include <vm/vm.h> 60#include <vm/vm_param.h> 61#include <vm/pmap.h> 62#include <vm/vm_kern.h> 63#include <vm/vm_extern.h> 64 65#include <x86/apicreg.h> 66#include <machine/clock.h> 67#include <machine/cputypes.h> 68#include <x86/mca.h> 69#include <machine/md_var.h> 70#include <machine/pcb.h> 71#include <machine/psl.h> 72#include <machine/smp.h> 73#include <machine/specialreg.h> 74#include <machine/cpu.h> 75 76/* lock region used by kernel profiling */ 77int mcount_lock; 78 79int mp_naps; /* # of Applications processors */ 80int boot_cpu_id = -1; /* designated BSP */ 81 82extern struct pcpu __pcpu[]; 83 84/* AP uses this during bootstrap. Do not staticize. */ 85char *bootSTK; 86int bootAP; 87 88/* Free these after use */ 89void *bootstacks[MAXCPU]; 90void *dpcpu; 91 92struct pcb stoppcbs[MAXCPU]; 93struct susppcb **susppcbs; 94 95#ifdef COUNT_IPIS 96/* Interrupt counts. */ 97static u_long *ipi_preempt_counts[MAXCPU]; 98static u_long *ipi_ast_counts[MAXCPU]; 99u_long *ipi_invltlb_counts[MAXCPU]; 100u_long *ipi_invlrng_counts[MAXCPU]; 101u_long *ipi_invlpg_counts[MAXCPU]; 102u_long *ipi_invlcache_counts[MAXCPU]; 103u_long *ipi_rendezvous_counts[MAXCPU]; 104static u_long *ipi_hardclock_counts[MAXCPU]; 105#endif 106 107/* Default cpu_ops implementation. */ 108struct cpu_ops cpu_ops; 109 110/* 111 * Local data and functions. 112 */ 113 114static volatile cpuset_t ipi_stop_nmi_pending; 115 116volatile cpuset_t resuming_cpus; 117volatile cpuset_t toresume_cpus; 118 119/* used to hold the AP's until we are ready to release them */ 120struct mtx ap_boot_mtx; 121 122/* Set to 1 once we're ready to let the APs out of the pen. */ 123volatile int aps_ready = 0; 124 125/* 126 * Store data from cpu_add() until later in the boot when we actually setup 127 * the APs. 128 */ 129struct cpu_info cpu_info[MAX_APIC_ID + 1]; 130int apic_cpuids[MAX_APIC_ID + 1]; 131int cpu_apic_ids[MAXCPU]; 132 133/* Holds pending bitmap based IPIs per CPU */ 134volatile u_int cpu_ipi_pending[MAXCPU]; 135 136static void release_aps(void *dummy); 137static void cpustop_handler_post(u_int cpu); 138 139static int hyperthreading_allowed = 1; 140SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, 141 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); 142 143static struct topo_node topo_root; 144 145static int pkg_id_shift; 146static int core_id_shift; 147static int disabled_cpus; 148 149struct cache_info { 150 int id_shift; 151 int present; 152} static caches[MAX_CACHE_LEVELS]; 153 154void 155mem_range_AP_init(void) 156{ 157 158 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 159 mem_range_softc.mr_op->initAP(&mem_range_softc); 160} 161 162/* 163 * Round up to the next power of two, if necessary, and then 164 * take log2. 165 * Returns -1 if argument is zero. 166 */ 167static __inline int 168mask_width(u_int x) 169{ 170 171 return (fls(x << (1 - powerof2(x))) - 1); 172} 173 174/* 175 * Add a cache level to the cache topology description. 176 */ 177static int 178add_deterministic_cache(int type, int level, int share_count) 179{ 180 181 if (type == 0) 182 return (0); 183 if (type > 3) { 184 printf("unexpected cache type %d\n", type); 185 return (1); 186 } 187 if (type == 2) /* ignore instruction cache */ 188 return (1); 189 if (level == 0 || level > MAX_CACHE_LEVELS) { 190 printf("unexpected cache level %d\n", type); 191 return (1); 192 } 193 194 if (caches[level - 1].present) { 195 printf("WARNING: multiple entries for L%u data cache\n", level); 196 printf("%u => %u\n", caches[level - 1].id_shift, 197 mask_width(share_count)); 198 } 199 caches[level - 1].id_shift = mask_width(share_count); 200 caches[level - 1].present = 1; 201 202 if (caches[level - 1].id_shift > pkg_id_shift) { 203 printf("WARNING: L%u data cache covers more " 204 "APIC IDs than a package\n", level); 205 printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift); 206 caches[level - 1].id_shift = pkg_id_shift; 207 } 208 if (caches[level - 1].id_shift < core_id_shift) { 209 printf("WARNING: L%u data cache covers less " 210 "APIC IDs than a core\n", level); 211 printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift); 212 caches[level - 1].id_shift = core_id_shift; 213 } 214 215 return (1); 216} 217 218/* 219 * Determine topology of processing units and caches for AMD CPUs. 220 * See: 221 * - AMD CPUID Specification (Publication # 25481) 222 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 223 * - BKDG For AMD Family 10h Processors (Publication # 31116) 224 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 225 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 226 */ 227static void 228topo_probe_amd(void) 229{ 230 u_int p[4]; 231 uint64_t v; 232 int level; 233 int nodes_per_socket; 234 int share_count; 235 int type; 236 int i; 237 238 /* No multi-core capability. */ 239 if ((amd_feature2 & AMDID2_CMP) == 0) 240 return; 241 242 /* For families 10h and newer. */ 243 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 244 AMDID_COREID_SIZE_SHIFT; 245 246 /* For 0Fh family. */ 247 if (pkg_id_shift == 0) 248 pkg_id_shift = 249 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 250 251 /* 252 * Families prior to 16h define the following value as 253 * cores per compute unit and we don't really care about the AMD 254 * compute units at the moment. Perhaps we should treat them as 255 * cores and cores within the compute units as hardware threads, 256 * but that's up for debate. 257 * Later families define the value as threads per compute unit, 258 * so we are following AMD's nomenclature here. 259 */ 260 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 261 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 262 cpuid_count(0x8000001e, 0, p); 263 share_count = ((p[1] >> 8) & 0xff) + 1; 264 core_id_shift = mask_width(share_count); 265 } 266 267 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 268 for (i = 0; ; i++) { 269 cpuid_count(0x8000001d, i, p); 270 type = p[0] & 0x1f; 271 level = (p[0] >> 5) & 0x7; 272 share_count = 1 + ((p[0] >> 14) & 0xfff); 273 274 if (!add_deterministic_cache(type, level, share_count)) 275 break; 276 } 277 } else { 278 if (cpu_exthigh >= 0x80000005) { 279 cpuid_count(0x80000005, 0, p); 280 if (((p[2] >> 24) & 0xff) != 0) { 281 caches[0].id_shift = 0; 282 caches[0].present = 1; 283 } 284 } 285 if (cpu_exthigh >= 0x80000006) { 286 cpuid_count(0x80000006, 0, p); 287 if (((p[2] >> 16) & 0xffff) != 0) { 288 caches[1].id_shift = 0; 289 caches[1].present = 1; 290 } 291 if (((p[3] >> 18) & 0x3fff) != 0) { 292 nodes_per_socket = 1; 293 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 294 /* 295 * Handle multi-node processors that 296 * have multiple chips, each with its 297 * own L3 cache, on the same die. 298 */ 299 v = rdmsr(0xc001100c); 300 nodes_per_socket = 1 + ((v >> 3) & 0x7); 301 } 302 caches[2].id_shift = 303 pkg_id_shift - mask_width(nodes_per_socket); 304 caches[2].present = 1; 305 } 306 } 307 } 308} 309 310/* 311 * Determine topology of processing units for Intel CPUs 312 * using CPUID Leaf 1 and Leaf 4, if supported. 313 * See: 314 * - Intel 64 Architecture Processor Topology Enumeration 315 * - Intel 64 and IA-32 ArchitecturesSoftware Developer���s Manual, 316 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 317 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 318 */ 319static void 320topo_probe_intel_0x4(void) 321{ 322 u_int p[4]; 323 int max_cores; 324 int max_logical; 325 326 /* Both zero and one here mean one logical processor per package. */ 327 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 328 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 329 if (max_logical <= 1) 330 return; 331 332 if (cpu_high >= 0x4) { 333 cpuid_count(0x04, 0, p); 334 max_cores = ((p[0] >> 26) & 0x3f) + 1; 335 } else 336 max_cores = 1; 337 338 core_id_shift = mask_width(max_logical/max_cores); 339 KASSERT(core_id_shift >= 0, 340 ("intel topo: max_cores > max_logical\n")); 341 pkg_id_shift = core_id_shift + mask_width(max_cores); 342} 343 344/* 345 * Determine topology of processing units for Intel CPUs 346 * using CPUID Leaf 11, if supported. 347 * See: 348 * - Intel 64 Architecture Processor Topology Enumeration 349 * - Intel 64 and IA-32 ArchitecturesSoftware Developer���s Manual, 350 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 351 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 352 */ 353static void 354topo_probe_intel_0xb(void) 355{ 356 u_int p[4]; 357 int bits; 358 int type; 359 int i; 360 361 /* Fall back if CPU leaf 11 doesn't really exist. */ 362 cpuid_count(0x0b, 0, p); 363 if (p[1] == 0) { 364 topo_probe_intel_0x4(); 365 return; 366 } 367 368 /* We only support three levels for now. */ 369 for (i = 0; ; i++) { 370 cpuid_count(0x0b, i, p); 371 372 bits = p[0] & 0x1f; 373 type = (p[2] >> 8) & 0xff; 374 375 if (type == 0) 376 break; 377 378 /* TODO: check for duplicate (re-)assignment */ 379 if (type == CPUID_TYPE_SMT) 380 core_id_shift = bits; 381 else if (type == CPUID_TYPE_CORE) 382 pkg_id_shift = bits; 383 else 384 printf("unknown CPU level type %d\n", type); 385 } 386 387 if (pkg_id_shift < core_id_shift) { 388 printf("WARNING: core covers more APIC IDs than a package\n"); 389 core_id_shift = pkg_id_shift; 390 } 391} 392 393/* 394 * Determine topology of caches for Intel CPUs. 395 * See: 396 * - Intel 64 Architecture Processor Topology Enumeration 397 * - Intel 64 and IA-32 Architectures Software Developer���s Manual 398 * Volume 2A: Instruction Set Reference, A-M, 399 * CPUID instruction 400 */ 401static void 402topo_probe_intel_caches(void) 403{ 404 u_int p[4]; 405 int level; 406 int share_count; 407 int type; 408 int i; 409 410 if (cpu_high < 0x4) { 411 /* 412 * Available cache level and sizes can be determined 413 * via CPUID leaf 2, but that requires a huge table of hardcoded 414 * values, so for now just assume L1 and L2 caches potentially 415 * shared only by HTT processing units, if HTT is present. 416 */ 417 caches[0].id_shift = pkg_id_shift; 418 caches[0].present = 1; 419 caches[1].id_shift = pkg_id_shift; 420 caches[1].present = 1; 421 return; 422 } 423 424 for (i = 0; ; i++) { 425 cpuid_count(0x4, i, p); 426 type = p[0] & 0x1f; 427 level = (p[0] >> 5) & 0x7; 428 share_count = 1 + ((p[0] >> 14) & 0xfff); 429 430 if (!add_deterministic_cache(type, level, share_count)) 431 break; 432 } 433} 434 435/* 436 * Determine topology of processing units and caches for Intel CPUs. 437 * See: 438 * - Intel 64 Architecture Processor Topology Enumeration 439 */ 440static void 441topo_probe_intel(void) 442{ 443 444 /* 445 * Note that 0x1 <= cpu_high < 4 case should be 446 * compatible with topo_probe_intel_0x4() logic when 447 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 448 * or it should trigger the fallback otherwise. 449 */ 450 if (cpu_high >= 0xb) 451 topo_probe_intel_0xb(); 452 else if (cpu_high >= 0x1) 453 topo_probe_intel_0x4(); 454 455 topo_probe_intel_caches(); 456} 457 458/* 459 * Topology information is queried only on BSP, on which this 460 * code runs and for which it can query CPUID information. 461 * Then topology is extrapolated on all packages using an 462 * assumption that APIC ID to hardware component ID mapping is 463 * homogenious. 464 * That doesn't necesserily imply that the topology is uniform. 465 */ 466void 467topo_probe(void) 468{ 469 static int cpu_topo_probed = 0; 470 struct x86_topo_layer { 471 int type; 472 int subtype; 473 int id_shift; 474 } topo_layers[MAX_CACHE_LEVELS + 3]; 475 struct topo_node *parent; 476 struct topo_node *node; 477 int layer; 478 int nlayers; 479 int node_id; 480 int i; 481 482 if (cpu_topo_probed) 483 return; 484 485 CPU_ZERO(&logical_cpus_mask); 486 487 if (mp_ncpus <= 1) 488 ; /* nothing */ 489 else if (cpu_vendor_id == CPU_VENDOR_AMD) 490 topo_probe_amd(); 491 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 492 topo_probe_intel(); 493 494 KASSERT(pkg_id_shift >= core_id_shift, 495 ("bug in APIC topology discovery")); 496 497 nlayers = 0; 498 bzero(topo_layers, sizeof(topo_layers)); 499 500 topo_layers[nlayers].type = TOPO_TYPE_PKG; 501 topo_layers[nlayers].id_shift = pkg_id_shift; 502 if (bootverbose) 503 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 504 nlayers++; 505 506 /* 507 * Consider all caches to be within a package/chip 508 * and "in front" of all sub-components like 509 * cores and hardware threads. 510 */ 511 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 512 if (caches[i].present) { 513 KASSERT(caches[i].id_shift <= pkg_id_shift, 514 ("bug in APIC topology discovery")); 515 KASSERT(caches[i].id_shift >= core_id_shift, 516 ("bug in APIC topology discovery")); 517 518 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 519 topo_layers[nlayers].subtype = i + 1; 520 topo_layers[nlayers].id_shift = caches[i].id_shift; 521 if (bootverbose) 522 printf("L%u cache ID shift: %u\n", 523 topo_layers[nlayers].subtype, 524 topo_layers[nlayers].id_shift); 525 nlayers++; 526 } 527 } 528 529 if (pkg_id_shift > core_id_shift) { 530 topo_layers[nlayers].type = TOPO_TYPE_CORE; 531 topo_layers[nlayers].id_shift = core_id_shift; 532 if (bootverbose) 533 printf("Core ID shift: %u\n", 534 topo_layers[nlayers].id_shift); 535 nlayers++; 536 } 537 538 topo_layers[nlayers].type = TOPO_TYPE_PU; 539 topo_layers[nlayers].id_shift = 0; 540 nlayers++; 541 542 topo_init_root(&topo_root); 543 for (i = 0; i <= MAX_APIC_ID; ++i) { 544 if (!cpu_info[i].cpu_present) 545 continue; 546 547 parent = &topo_root; 548 for (layer = 0; layer < nlayers; ++layer) { 549 node_id = i >> topo_layers[layer].id_shift; 550 parent = topo_add_node_by_hwid(parent, node_id, 551 topo_layers[layer].type, 552 topo_layers[layer].subtype); 553 } 554 } 555 556 parent = &topo_root; 557 for (layer = 0; layer < nlayers; ++layer) { 558 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 559 node = topo_find_node_by_hwid(parent, node_id, 560 topo_layers[layer].type, 561 topo_layers[layer].subtype); 562 topo_promote_child(node); 563 parent = node; 564 } 565 566 cpu_topo_probed = 1; 567} 568 569/* 570 * Assign logical CPU IDs to local APICs. 571 */ 572void 573assign_cpu_ids(void) 574{ 575 struct topo_node *node; 576 u_int smt_mask; 577 578 smt_mask = (1u << core_id_shift) - 1; 579 580 /* 581 * Assign CPU IDs to local APIC IDs and disable any CPUs 582 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 583 */ 584 mp_ncpus = 0; 585 TOPO_FOREACH(node, &topo_root) { 586 if (node->type != TOPO_TYPE_PU) 587 continue; 588 589 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 590 cpu_info[node->hwid].cpu_hyperthread = 1; 591 592 if (resource_disabled("lapic", node->hwid)) { 593 if (node->hwid != boot_cpu_id) 594 cpu_info[node->hwid].cpu_disabled = 1; 595 else 596 printf("Cannot disable BSP, APIC ID = %d\n", 597 node->hwid); 598 } 599 600 if (!hyperthreading_allowed && 601 cpu_info[node->hwid].cpu_hyperthread) 602 cpu_info[node->hwid].cpu_disabled = 1; 603 604 if (mp_ncpus >= MAXCPU) 605 cpu_info[node->hwid].cpu_disabled = 1; 606 607 if (cpu_info[node->hwid].cpu_disabled) { 608 disabled_cpus++; 609 continue; 610 } 611 612 cpu_apic_ids[mp_ncpus] = node->hwid; 613 apic_cpuids[node->hwid] = mp_ncpus; 614 topo_set_pu_id(node, mp_ncpus); 615 mp_ncpus++; 616 } 617 618 KASSERT(mp_maxid >= mp_ncpus - 1, 619 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 620 mp_ncpus)); 621} 622 623/* 624 * Print various information about the SMP system hardware and setup. 625 */ 626void 627cpu_mp_announce(void) 628{ 629 struct topo_node *node; 630 const char *hyperthread; 631 int pkg_count; 632 int cores_per_pkg; 633 int thrs_per_core; 634 635 printf("FreeBSD/SMP: "); 636 if (topo_analyze(&topo_root, 1, &pkg_count, 637 &cores_per_pkg, &thrs_per_core)) { 638 printf("%d package(s)", pkg_count); 639 if (cores_per_pkg > 0) 640 printf(" x %d core(s)", cores_per_pkg); 641 if (thrs_per_core > 1) 642 printf(" x %d hardware threads", thrs_per_core); 643 } else { 644 printf("Non-uniform topology"); 645 } 646 printf("\n"); 647 648 if (disabled_cpus) { 649 printf("FreeBSD/SMP Online: "); 650 if (topo_analyze(&topo_root, 0, &pkg_count, 651 &cores_per_pkg, &thrs_per_core)) { 652 printf("%d package(s)", pkg_count); 653 if (cores_per_pkg > 0) 654 printf(" x %d core(s)", cores_per_pkg); 655 if (thrs_per_core > 1) 656 printf(" x %d hardware threads", thrs_per_core); 657 } else { 658 printf("Non-uniform topology"); 659 } 660 printf("\n"); 661 } 662 663 if (!bootverbose) 664 return; 665 666 TOPO_FOREACH(node, &topo_root) { 667 switch (node->type) { 668 case TOPO_TYPE_PKG: 669 printf("Package HW ID = %u (%#x)\n", 670 node->hwid, node->hwid); 671 break; 672 case TOPO_TYPE_CORE: 673 printf("\tCore HW ID = %u (%#x)\n", 674 node->hwid, node->hwid); 675 break; 676 case TOPO_TYPE_PU: 677 if (cpu_info[node->hwid].cpu_hyperthread) 678 hyperthread = "/HT"; 679 else 680 hyperthread = ""; 681 682 if (node->subtype == 0) 683 printf("\t\tCPU (AP%s): APIC ID: %u (%#x)" 684 "(disabled)\n", hyperthread, node->hwid, 685 node->hwid); 686 else if (node->id == 0) 687 printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n", 688 node->hwid, node->hwid); 689 else 690 printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n", 691 node->id, hyperthread, node->hwid, 692 node->hwid); 693 break; 694 default: 695 /* ignored */ 696 break; 697 } 698 } 699} 700 701/* 702 * Add a scheduling group, a group of logical processors sharing 703 * a particular cache (and, thus having an affinity), to the scheduling 704 * topology. 705 * This function recursively works on lower level caches. 706 */ 707static void 708x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 709{ 710 struct topo_node *node; 711 int nchildren; 712 int ncores; 713 int i; 714 715 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE, 716 ("x86topo_add_sched_group: bad type: %u", root->type)); 717 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 718 cg_root->cg_count = root->cpu_count; 719 if (root->type == TOPO_TYPE_SYSTEM) 720 cg_root->cg_level = CG_SHARE_NONE; 721 else 722 cg_root->cg_level = root->subtype; 723 724 /* 725 * Check how many core nodes we have under the given root node. 726 * If we have multiple logical processors, but not multiple 727 * cores, then those processors must be hardware threads. 728 */ 729 ncores = 0; 730 node = root; 731 while (node != NULL) { 732 if (node->type != TOPO_TYPE_CORE) { 733 node = topo_next_node(root, node); 734 continue; 735 } 736 737 ncores++; 738 node = topo_next_nonchild_node(root, node); 739 } 740 741 if (cg_root->cg_level != CG_SHARE_NONE && 742 root->cpu_count > 1 && ncores < 2) 743 cg_root->cg_flags = CG_FLAG_SMT; 744 745 /* 746 * Find out how many cache nodes we have under the given root node. 747 * We ignore cache nodes that cover all the same processors as the 748 * root node. Also, we do not descend below found cache nodes. 749 * That is, we count top-level "non-redundant" caches under the root 750 * node. 751 */ 752 nchildren = 0; 753 node = root; 754 while (node != NULL) { 755 if (node->type != TOPO_TYPE_CACHE || 756 (root->type != TOPO_TYPE_SYSTEM && 757 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 758 node = topo_next_node(root, node); 759 continue; 760 } 761 nchildren++; 762 node = topo_next_nonchild_node(root, node); 763 } 764 765 cg_root->cg_child = smp_topo_alloc(nchildren); 766 cg_root->cg_children = nchildren; 767 768 /* 769 * Now find again the same cache nodes as above and recursively 770 * build scheduling topologies for them. 771 */ 772 node = root; 773 i = 0; 774 while (node != NULL) { 775 if (node->type != TOPO_TYPE_CACHE || 776 (root->type != TOPO_TYPE_SYSTEM && 777 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 778 node = topo_next_node(root, node); 779 continue; 780 } 781 cg_root->cg_child[i].cg_parent = cg_root; 782 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 783 i++; 784 node = topo_next_nonchild_node(root, node); 785 } 786} 787 788/* 789 * Build the MI scheduling topology from the discovered hardware topology. 790 */ 791struct cpu_group * 792cpu_topo(void) 793{ 794 struct cpu_group *cg_root; 795 796 if (mp_ncpus <= 1) 797 return (smp_topo_none()); 798 799 cg_root = smp_topo_alloc(1); 800 x86topo_add_sched_group(&topo_root, cg_root); 801 return (cg_root); 802} 803 804 805/* 806 * Add a logical CPU to the topology. 807 */ 808void 809cpu_add(u_int apic_id, char boot_cpu) 810{ 811 812 if (apic_id > MAX_APIC_ID) { 813 panic("SMP: APIC ID %d too high", apic_id); 814 return; 815 } 816 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 817 apic_id)); 818 cpu_info[apic_id].cpu_present = 1; 819 if (boot_cpu) { 820 KASSERT(boot_cpu_id == -1, 821 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 822 boot_cpu_id)); 823 boot_cpu_id = apic_id; 824 cpu_info[apic_id].cpu_bsp = 1; 825 } 826 if (mp_ncpus < MAXCPU) { 827 mp_ncpus++; 828 mp_maxid = mp_ncpus - 1; 829 } 830 if (bootverbose) 831 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 832 "AP"); 833} 834 835void 836cpu_mp_setmaxid(void) 837{ 838 839 /* 840 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 841 * If there were no calls to cpu_add() assume this is a UP system. 842 */ 843 if (mp_ncpus == 0) 844 mp_ncpus = 1; 845} 846 847int 848cpu_mp_probe(void) 849{ 850 851 /* 852 * Always record BSP in CPU map so that the mbuf init code works 853 * correctly. 854 */ 855 CPU_SETOF(0, &all_cpus); 856 return (mp_ncpus > 1); 857} 858 859/* 860 * AP CPU's call this to initialize themselves. 861 */ 862void 863init_secondary_tail(void) 864{ 865 u_int cpuid; 866 867 /* 868 * On real hardware, switch to x2apic mode if possible. Do it 869 * after aps_ready was signalled, to avoid manipulating the 870 * mode while BSP might still want to send some IPI to us 871 * (second startup IPI is ignored on modern hardware etc). 872 */ 873 lapic_xapic_mode(); 874 875 /* Initialize the PAT MSR. */ 876 pmap_init_pat(); 877 878 /* set up CPU registers and state */ 879 cpu_setregs(); 880 881 /* set up SSE/NX */ 882 initializecpu(); 883 884 /* set up FPU state on the AP */ 885#ifdef __amd64__ 886 fpuinit(); 887#else 888 npxinit(false); 889#endif 890 891 if (cpu_ops.cpu_init) 892 cpu_ops.cpu_init(); 893 894 /* A quick check from sanity claus */ 895 cpuid = PCPU_GET(cpuid); 896 if (PCPU_GET(apic_id) != lapic_id()) { 897 printf("SMP: cpuid = %d\n", cpuid); 898 printf("SMP: actual apic_id = %d\n", lapic_id()); 899 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 900 panic("cpuid mismatch! boom!!"); 901 } 902 903 /* Initialize curthread. */ 904 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 905 PCPU_SET(curthread, PCPU_GET(idlethread)); 906 907 mtx_lock_spin(&ap_boot_mtx); 908 909 mca_init(); 910 911 /* Init local apic for irq's */ 912 lapic_setup(1); 913 914 /* Set memory range attributes for this CPU to match the BSP */ 915 mem_range_AP_init(); 916 917 smp_cpus++; 918 919 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 920 printf("SMP: AP CPU #%d Launched!\n", cpuid); 921 922 /* Determine if we are a logical CPU. */ 923 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 924 CPU_SET(cpuid, &logical_cpus_mask); 925 926 if (bootverbose) 927 lapic_dump("AP"); 928 929 if (smp_cpus == mp_ncpus) { 930 /* enable IPI's, tlb shootdown, freezes etc */ 931 atomic_store_rel_int(&smp_started, 1); 932 } 933 934#ifdef __amd64__ 935 /* 936 * Enable global pages TLB extension 937 * This also implicitly flushes the TLB 938 */ 939 load_cr4(rcr4() | CR4_PGE); 940 if (pmap_pcid_enabled) 941 load_cr4(rcr4() | CR4_PCIDE); 942 load_ds(_udatasel); 943 load_es(_udatasel); 944 load_fs(_ufssel); 945#endif 946 947 mtx_unlock_spin(&ap_boot_mtx); 948 949 /* Wait until all the AP's are up. */ 950 while (atomic_load_acq_int(&smp_started) == 0) 951 ia32_pause(); 952 953#ifndef EARLY_AP_STARTUP 954 /* Start per-CPU event timers. */ 955 cpu_initclocks_ap(); 956#endif 957 958 sched_throw(NULL); 959 960 panic("scheduler returned us to %s", __func__); 961 /* NOTREACHED */ 962} 963 964/******************************************************************* 965 * local functions and data 966 */ 967 968/* 969 * We tell the I/O APIC code about all the CPUs we want to receive 970 * interrupts. If we don't want certain CPUs to receive IRQs we 971 * can simply not tell the I/O APIC code about them in this function. 972 * We also do not tell it about the BSP since it tells itself about 973 * the BSP internally to work with UP kernels and on UP machines. 974 */ 975void 976set_interrupt_apic_ids(void) 977{ 978 u_int i, apic_id; 979 980 for (i = 0; i < MAXCPU; i++) { 981 apic_id = cpu_apic_ids[i]; 982 if (apic_id == -1) 983 continue; 984 if (cpu_info[apic_id].cpu_bsp) 985 continue; 986 if (cpu_info[apic_id].cpu_disabled) 987 continue; 988 989 /* Don't let hyperthreads service interrupts. */ 990 if (cpu_info[apic_id].cpu_hyperthread) 991 continue; 992 993 intr_add_cpu(i); 994 } 995} 996 997 998#ifdef COUNT_XINVLTLB_HITS 999u_int xhits_gbl[MAXCPU]; 1000u_int xhits_pg[MAXCPU]; 1001u_int xhits_rng[MAXCPU]; 1002static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 1003SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1004 sizeof(xhits_gbl), "IU", ""); 1005SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1006 sizeof(xhits_pg), "IU", ""); 1007SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1008 sizeof(xhits_rng), "IU", ""); 1009 1010u_int ipi_global; 1011u_int ipi_page; 1012u_int ipi_range; 1013u_int ipi_range_size; 1014SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1015SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1016SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1017SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 1018 0, ""); 1019#endif /* COUNT_XINVLTLB_HITS */ 1020 1021/* 1022 * Init and startup IPI. 1023 */ 1024void 1025ipi_startup(int apic_id, int vector) 1026{ 1027 1028 /* 1029 * This attempts to follow the algorithm described in the 1030 * Intel Multiprocessor Specification v1.4 in section B.4. 1031 * For each IPI, we allow the local APIC ~20us to deliver the 1032 * IPI. If that times out, we panic. 1033 */ 1034 1035 /* 1036 * first we do an INIT IPI: this INIT IPI might be run, resetting 1037 * and running the target CPU. OR this INIT IPI might be latched (P5 1038 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1039 * ignored. 1040 */ 1041 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1042 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1043 lapic_ipi_wait(100); 1044 1045 /* Explicitly deassert the INIT IPI. */ 1046 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1047 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1048 apic_id); 1049 1050 DELAY(10000); /* wait ~10mS */ 1051 1052 /* 1053 * next we do a STARTUP IPI: the previous INIT IPI might still be 1054 * latched, (P5 bug) this 1st STARTUP would then terminate 1055 * immediately, and the previously started INIT IPI would continue. OR 1056 * the previous INIT IPI has already run. and this STARTUP IPI will 1057 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1058 * will run. 1059 */ 1060 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1061 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1062 vector, apic_id); 1063 if (!lapic_ipi_wait(100)) 1064 panic("Failed to deliver first STARTUP IPI to APIC %d", 1065 apic_id); 1066 DELAY(200); /* wait ~200uS */ 1067 1068 /* 1069 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1070 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1071 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1072 * recognized after hardware RESET or INIT IPI. 1073 */ 1074 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1075 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1076 vector, apic_id); 1077 if (!lapic_ipi_wait(100)) 1078 panic("Failed to deliver second STARTUP IPI to APIC %d", 1079 apic_id); 1080 1081 DELAY(200); /* wait ~200uS */ 1082} 1083 1084/* 1085 * Send an IPI to specified CPU handling the bitmap logic. 1086 */ 1087void 1088ipi_send_cpu(int cpu, u_int ipi) 1089{ 1090 u_int bitmap, old_pending, new_pending; 1091 1092 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); 1093 1094 if (IPI_IS_BITMAPED(ipi)) { 1095 bitmap = 1 << ipi; 1096 ipi = IPI_BITMAP_VECTOR; 1097 do { 1098 old_pending = cpu_ipi_pending[cpu]; 1099 new_pending = old_pending | bitmap; 1100 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1101 old_pending, new_pending)); 1102 if (old_pending) 1103 return; 1104 } 1105 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1106} 1107 1108void 1109ipi_bitmap_handler(struct trapframe frame) 1110{ 1111 struct trapframe *oldframe; 1112 struct thread *td; 1113 int cpu = PCPU_GET(cpuid); 1114 u_int ipi_bitmap; 1115 1116 critical_enter(); 1117 td = curthread; 1118 td->td_intr_nesting_level++; 1119 oldframe = td->td_intr_frame; 1120 td->td_intr_frame = &frame; 1121 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1122 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1123#ifdef COUNT_IPIS 1124 (*ipi_preempt_counts[cpu])++; 1125#endif 1126 sched_preempt(td); 1127 } 1128 if (ipi_bitmap & (1 << IPI_AST)) { 1129#ifdef COUNT_IPIS 1130 (*ipi_ast_counts[cpu])++; 1131#endif 1132 /* Nothing to do for AST */ 1133 } 1134 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1135#ifdef COUNT_IPIS 1136 (*ipi_hardclock_counts[cpu])++; 1137#endif 1138 hardclockintr(); 1139 } 1140 td->td_intr_frame = oldframe; 1141 td->td_intr_nesting_level--; 1142 critical_exit(); 1143} 1144 1145/* 1146 * send an IPI to a set of cpus. 1147 */ 1148void 1149ipi_selected(cpuset_t cpus, u_int ipi) 1150{ 1151 int cpu; 1152 1153 /* 1154 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1155 * of help in order to understand what is the source. 1156 * Set the mask of receiving CPUs for this purpose. 1157 */ 1158 if (ipi == IPI_STOP_HARD) 1159 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1160 1161 while ((cpu = CPU_FFS(&cpus)) != 0) { 1162 cpu--; 1163 CPU_CLR(cpu, &cpus); 1164 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1165 ipi_send_cpu(cpu, ipi); 1166 } 1167} 1168 1169/* 1170 * send an IPI to a specific CPU. 1171 */ 1172void 1173ipi_cpu(int cpu, u_int ipi) 1174{ 1175 1176 /* 1177 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1178 * of help in order to understand what is the source. 1179 * Set the mask of receiving CPUs for this purpose. 1180 */ 1181 if (ipi == IPI_STOP_HARD) 1182 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1183 1184 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1185 ipi_send_cpu(cpu, ipi); 1186} 1187 1188/* 1189 * send an IPI to all CPUs EXCEPT myself 1190 */ 1191void 1192ipi_all_but_self(u_int ipi) 1193{ 1194 cpuset_t other_cpus; 1195 1196 other_cpus = all_cpus; 1197 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1198 if (IPI_IS_BITMAPED(ipi)) { 1199 ipi_selected(other_cpus, ipi); 1200 return; 1201 } 1202 1203 /* 1204 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1205 * of help in order to understand what is the source. 1206 * Set the mask of receiving CPUs for this purpose. 1207 */ 1208 if (ipi == IPI_STOP_HARD) 1209 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1210 1211 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1212 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1213} 1214 1215int 1216ipi_nmi_handler(void) 1217{ 1218 u_int cpuid; 1219 1220 /* 1221 * As long as there is not a simple way to know about a NMI's 1222 * source, if the bitmask for the current CPU is present in 1223 * the global pending bitword an IPI_STOP_HARD has been issued 1224 * and should be handled. 1225 */ 1226 cpuid = PCPU_GET(cpuid); 1227 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1228 return (1); 1229 1230 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1231 cpustop_handler(); 1232 return (0); 1233} 1234 1235int nmi_kdb_lock; 1236 1237void 1238nmi_call_kdb_smp(u_int type, struct trapframe *frame) 1239{ 1240 int cpu; 1241 bool call_post; 1242 1243 cpu = PCPU_GET(cpuid); 1244 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { 1245 nmi_call_kdb(cpu, type, frame); 1246 call_post = false; 1247 } else { 1248 savectx(&stoppcbs[cpu]); 1249 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1250 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) 1251 ia32_pause(); 1252 call_post = true; 1253 } 1254 atomic_store_rel_int(&nmi_kdb_lock, 0); 1255 if (call_post) 1256 cpustop_handler_post(cpu); 1257} 1258 1259/* 1260 * Handle an IPI_STOP by saving our current context and spinning until we 1261 * are resumed. 1262 */ 1263void 1264cpustop_handler(void) 1265{ 1266 u_int cpu; 1267 1268 cpu = PCPU_GET(cpuid); 1269 1270 savectx(&stoppcbs[cpu]); 1271 1272 /* Indicate that we are stopped */ 1273 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1274 1275 /* Wait for restart */ 1276 while (!CPU_ISSET(cpu, &started_cpus)) 1277 ia32_pause(); 1278 1279 cpustop_handler_post(cpu); 1280} 1281 1282static void 1283cpustop_handler_post(u_int cpu) 1284{ 1285 1286 CPU_CLR_ATOMIC(cpu, &started_cpus); 1287 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1288 1289#if defined(__amd64__) && defined(DDB) 1290 amd64_db_resume_dbreg(); 1291#endif 1292 1293 if (cpu == 0 && cpustop_restartfunc != NULL) { 1294 cpustop_restartfunc(); 1295 cpustop_restartfunc = NULL; 1296 } 1297} 1298 1299/* 1300 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1301 * are resumed. 1302 */ 1303void 1304cpususpend_handler(void) 1305{ 1306 u_int cpu; 1307 1308 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1309 1310 cpu = PCPU_GET(cpuid); 1311 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1312#ifdef __amd64__ 1313 fpususpend(susppcbs[cpu]->sp_fpususpend); 1314#else 1315 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1316#endif 1317 /* 1318 * suspended_cpus is cleared shortly after each AP is restarted 1319 * by a Startup IPI, so that the BSP can proceed to restarting 1320 * the next AP. 1321 * 1322 * resuming_cpus gets cleared when the AP completes 1323 * initialization after having been released by the BSP. 1324 * resuming_cpus is probably not the best name for the 1325 * variable, because it is actually a set of processors that 1326 * haven't resumed yet and haven't necessarily started resuming. 1327 * 1328 * Note that suspended_cpus is meaningful only for ACPI suspend 1329 * as it's not really used for Xen suspend since the APs are 1330 * automatically restored to the running state and the correct 1331 * context. For the same reason resumectx is never called in 1332 * that case. 1333 */ 1334 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1335 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1336 1337 /* 1338 * Invalidate the cache after setting the global status bits. 1339 * The last AP to set its bit may end up being an Owner of the 1340 * corresponding cache line in MOESI protocol. The AP may be 1341 * stopped before the cache line is written to the main memory. 1342 */ 1343 wbinvd(); 1344 } else { 1345#ifdef __amd64__ 1346 fpuresume(susppcbs[cpu]->sp_fpususpend); 1347#else 1348 npxresume(susppcbs[cpu]->sp_fpususpend); 1349#endif 1350 pmap_init_pat(); 1351 initializecpu(); 1352 PCPU_SET(switchtime, 0); 1353 PCPU_SET(switchticks, ticks); 1354 1355 /* Indicate that we have restarted and restored the context. */ 1356 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1357 } 1358 1359 /* Wait for resume directive */ 1360 while (!CPU_ISSET(cpu, &toresume_cpus)) 1361 ia32_pause(); 1362 1363 if (cpu_ops.cpu_resume) 1364 cpu_ops.cpu_resume(); 1365#ifdef __amd64__ 1366 if (vmm_resume_p) 1367 vmm_resume_p(); 1368#endif 1369 1370 /* Resume MCA and local APIC */ 1371 lapic_xapic_mode(); 1372 mca_resume(); 1373 lapic_setup(0); 1374 1375 /* Indicate that we are resumed */ 1376 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1377 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1378 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1379} 1380 1381 1382void 1383invlcache_handler(void) 1384{ 1385 uint32_t generation; 1386 1387#ifdef COUNT_IPIS 1388 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; 1389#endif /* COUNT_IPIS */ 1390 1391 /* 1392 * Reading the generation here allows greater parallelism 1393 * since wbinvd is a serializing instruction. Without the 1394 * temporary, we'd wait for wbinvd to complete, then the read 1395 * would execute, then the dependent write, which must then 1396 * complete before return from interrupt. 1397 */ 1398 generation = smp_tlb_generation; 1399 wbinvd(); 1400 PCPU_SET(smp_tlb_done, generation); 1401} 1402 1403/* 1404 * This is called once the rest of the system is up and running and we're 1405 * ready to let the AP's out of the pen. 1406 */ 1407static void 1408release_aps(void *dummy __unused) 1409{ 1410 1411 if (mp_ncpus == 1) 1412 return; 1413 atomic_store_rel_int(&aps_ready, 1); 1414 while (smp_started == 0) 1415 ia32_pause(); 1416} 1417SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1418 1419#ifdef COUNT_IPIS 1420/* 1421 * Setup interrupt counters for IPI handlers. 1422 */ 1423static void 1424mp_ipi_intrcnt(void *dummy) 1425{ 1426 char buf[64]; 1427 int i; 1428 1429 CPU_FOREACH(i) { 1430 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1431 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1432 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1433 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1434 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1435 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1436 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1437 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1438 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1439 intrcnt_add(buf, &ipi_preempt_counts[i]); 1440 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1441 intrcnt_add(buf, &ipi_ast_counts[i]); 1442 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1443 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1444 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1445 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1446 } 1447} 1448SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1449#endif 1450 1451/* 1452 * Flush the TLB on other CPU's 1453 */ 1454 1455/* Variables needed for SMP tlb shootdown. */ 1456vm_offset_t smp_tlb_addr1, smp_tlb_addr2; 1457pmap_t smp_tlb_pmap; 1458volatile uint32_t smp_tlb_generation; 1459 1460#ifdef __amd64__ 1461#define read_eflags() read_rflags() 1462#endif 1463 1464static void 1465smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, 1466 vm_offset_t addr1, vm_offset_t addr2) 1467{ 1468 cpuset_t other_cpus; 1469 volatile uint32_t *p_cpudone; 1470 uint32_t generation; 1471 int cpu; 1472 1473 /* 1474 * Check for other cpus. Return if none. 1475 */ 1476 if (CPU_ISFULLSET(&mask)) { 1477 if (mp_ncpus <= 1) 1478 return; 1479 } else { 1480 CPU_CLR(PCPU_GET(cpuid), &mask); 1481 if (CPU_EMPTY(&mask)) 1482 return; 1483 } 1484 1485 if (!(read_eflags() & PSL_I)) 1486 panic("%s: interrupts disabled", __func__); 1487 mtx_lock_spin(&smp_ipi_mtx); 1488 smp_tlb_addr1 = addr1; 1489 smp_tlb_addr2 = addr2; 1490 smp_tlb_pmap = pmap; 1491 generation = ++smp_tlb_generation; 1492 if (CPU_ISFULLSET(&mask)) { 1493 ipi_all_but_self(vector); 1494 other_cpus = all_cpus; 1495 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1496 } else { 1497 other_cpus = mask; 1498 while ((cpu = CPU_FFS(&mask)) != 0) { 1499 cpu--; 1500 CPU_CLR(cpu, &mask); 1501 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1502 cpu, vector); 1503 ipi_send_cpu(cpu, vector); 1504 } 1505 } 1506 while ((cpu = CPU_FFS(&other_cpus)) != 0) { 1507 cpu--; 1508 CPU_CLR(cpu, &other_cpus); 1509 p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; 1510 while (*p_cpudone != generation) 1511 ia32_pause(); 1512 } 1513 mtx_unlock_spin(&smp_ipi_mtx); 1514} 1515 1516void 1517smp_masked_invltlb(cpuset_t mask, pmap_t pmap) 1518{ 1519 1520 if (smp_started) { 1521 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); 1522#ifdef COUNT_XINVLTLB_HITS 1523 ipi_global++; 1524#endif 1525 } 1526} 1527 1528void 1529smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap) 1530{ 1531 1532 if (smp_started) { 1533 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); 1534#ifdef COUNT_XINVLTLB_HITS 1535 ipi_page++; 1536#endif 1537 } 1538} 1539 1540void 1541smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, 1542 pmap_t pmap) 1543{ 1544 1545 if (smp_started) { 1546 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, 1547 addr1, addr2); 1548#ifdef COUNT_XINVLTLB_HITS 1549 ipi_range++; 1550 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1551#endif 1552 } 1553} 1554 1555void 1556smp_cache_flush(void) 1557{ 1558 1559 if (smp_started) { 1560 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 1561 0, 0); 1562 } 1563} 1564 1565/* 1566 * Handlers for TLB related IPIs 1567 */ 1568void 1569invltlb_handler(void) 1570{ 1571 uint32_t generation; 1572 1573#ifdef COUNT_XINVLTLB_HITS 1574 xhits_gbl[PCPU_GET(cpuid)]++; 1575#endif /* COUNT_XINVLTLB_HITS */ 1576#ifdef COUNT_IPIS 1577 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1578#endif /* COUNT_IPIS */ 1579 1580 /* 1581 * Reading the generation here allows greater parallelism 1582 * since invalidating the TLB is a serializing operation. 1583 */ 1584 generation = smp_tlb_generation; 1585 if (smp_tlb_pmap == kernel_pmap) 1586 invltlb_glob(); 1587 else 1588 invltlb(); 1589 PCPU_SET(smp_tlb_done, generation); 1590} 1591 1592void 1593invlpg_handler(void) 1594{ 1595 uint32_t generation; 1596 1597#ifdef COUNT_XINVLTLB_HITS 1598 xhits_pg[PCPU_GET(cpuid)]++; 1599#endif /* COUNT_XINVLTLB_HITS */ 1600#ifdef COUNT_IPIS 1601 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1602#endif /* COUNT_IPIS */ 1603 1604 generation = smp_tlb_generation; /* Overlap with serialization */ 1605 invlpg(smp_tlb_addr1); 1606 PCPU_SET(smp_tlb_done, generation); 1607} 1608 1609void 1610invlrng_handler(void) 1611{ 1612 vm_offset_t addr, addr2; 1613 uint32_t generation; 1614 1615#ifdef COUNT_XINVLTLB_HITS 1616 xhits_rng[PCPU_GET(cpuid)]++; 1617#endif /* COUNT_XINVLTLB_HITS */ 1618#ifdef COUNT_IPIS 1619 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; 1620#endif /* COUNT_IPIS */ 1621 1622 addr = smp_tlb_addr1; 1623 addr2 = smp_tlb_addr2; 1624 generation = smp_tlb_generation; /* Overlap with serialization */ 1625 do { 1626 invlpg(addr); 1627 addr += PAGE_SIZE; 1628 } while (addr < addr2); 1629 1630 PCPU_SET(smp_tlb_done, generation); 1631} 1632