mp_x86.c revision 329863
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 329863 2018-02-23 11:17:16Z kib $");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/cpu.h>

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application Processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize. */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int apic_cpuids[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package\n", level);
		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core\n", level);
		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 * - AMD CPUID Specification (Publication # 25481)
 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 * - BKDG For AMD Family 10h Processors (Publication # 31116)
 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 2A: Instruction Set Reference, A-M,
 *   CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 3];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

	topo_init_root(&topo_root);
	for (i = 0; i <= MAX_APIC_ID; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
			node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	int pkg_count;
	int cores_per_pkg;
	int thrs_per_core;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &pkg_count,
	    &cores_per_pkg, &thrs_per_core)) {
		printf("%d package(s)", pkg_count);
		if (cores_per_pkg > 0)
			printf(" x %d core(s)", cores_per_pkg);
		if (thrs_per_core > 1)
			printf(" x %d hardware threads", thrs_per_core);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &pkg_count,
		    &cores_per_pkg, &thrs_per_core)) {
			printf("%d package(s)", pkg_count);
			if (cores_per_pkg > 0)
				printf(" x %d core(s)", cores_per_pkg);
			if (thrs_per_core > 1)
				printf(" x %d hardware threads", thrs_per_core);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u (%#x)\n",
			    node->hwid, node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
				    "(disabled)\n", hyperthread, node->hwid,
				    node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
				    node->hwid, node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
				    node->id, hyperthread, node->hwid,
				    node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_SYSTEM)
		cg_root->cg_level = CG_SHARE_NONE;
	else
		cg_root->cg_level = root->subtype;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags = CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CACHE ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}


/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
"BSP" : 830 "AP"); 831} 832 833void 834cpu_mp_setmaxid(void) 835{ 836 837 /* 838 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 839 * If there were no calls to cpu_add() assume this is a UP system. 840 */ 841 if (mp_ncpus == 0) 842 mp_ncpus = 1; 843} 844 845int 846cpu_mp_probe(void) 847{ 848 849 /* 850 * Always record BSP in CPU map so that the mbuf init code works 851 * correctly. 852 */ 853 CPU_SETOF(0, &all_cpus); 854 return (mp_ncpus > 1); 855} 856 857/* 858 * AP CPU's call this to initialize themselves. 859 */ 860void 861init_secondary_tail(void) 862{ 863 u_int cpuid; 864 865 /* 866 * On real hardware, switch to x2apic mode if possible. Do it 867 * after aps_ready was signalled, to avoid manipulating the 868 * mode while BSP might still want to send some IPI to us 869 * (second startup IPI is ignored on modern hardware etc). 870 */ 871 lapic_xapic_mode(); 872 873 /* Initialize the PAT MSR. */ 874 pmap_init_pat(); 875 876 /* set up CPU registers and state */ 877 cpu_setregs(); 878 879 /* set up SSE/NX */ 880 initializecpu(); 881 882 /* set up FPU state on the AP */ 883#ifdef __amd64__ 884 fpuinit(); 885#else 886 npxinit(false); 887#endif 888 889 if (cpu_ops.cpu_init) 890 cpu_ops.cpu_init(); 891 892 /* A quick check from sanity claus */ 893 cpuid = PCPU_GET(cpuid); 894 if (PCPU_GET(apic_id) != lapic_id()) { 895 printf("SMP: cpuid = %d\n", cpuid); 896 printf("SMP: actual apic_id = %d\n", lapic_id()); 897 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 898 panic("cpuid mismatch! boom!!"); 899 } 900 901 /* Initialize curthread. */ 902 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 903 PCPU_SET(curthread, PCPU_GET(idlethread)); 904 905 mca_init(); 906 907 mtx_lock_spin(&ap_boot_mtx); 908 909 /* Init local apic for irq's */ 910 lapic_setup(1); 911 912 /* Set memory range attributes for this CPU to match the BSP */ 913 mem_range_AP_init(); 914 915 smp_cpus++; 916 917 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 918 printf("SMP: AP CPU #%d Launched!\n", cpuid); 919 920 /* Determine if we are a logical CPU. */ 921 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 922 CPU_SET(cpuid, &logical_cpus_mask); 923 924 if (bootverbose) 925 lapic_dump("AP"); 926 927 if (smp_cpus == mp_ncpus) { 928 /* enable IPI's, tlb shootdown, freezes etc */ 929 atomic_store_rel_int(&smp_started, 1); 930 } 931 932#ifdef __amd64__ 933 /* 934 * Enable global pages TLB extension 935 * This also implicitly flushes the TLB 936 */ 937 load_cr4(rcr4() | CR4_PGE); 938 if (pmap_pcid_enabled) 939 load_cr4(rcr4() | CR4_PCIDE); 940 load_ds(_udatasel); 941 load_es(_udatasel); 942 load_fs(_ufssel); 943#endif 944 945 mtx_unlock_spin(&ap_boot_mtx); 946 947 /* Wait until all the AP's are up. */ 948 while (atomic_load_acq_int(&smp_started) == 0) 949 ia32_pause(); 950 951#ifndef EARLY_AP_STARTUP 952 /* Start per-CPU event timers. */ 953 cpu_initclocks_ap(); 954#endif 955 956 sched_throw(NULL); 957 958 panic("scheduler returned us to %s", __func__); 959 /* NOTREACHED */ 960} 961 962/******************************************************************* 963 * local functions and data 964 */ 965 966/* 967 * We tell the I/O APIC code about all the CPUs we want to receive 968 * interrupts. If we don't want certain CPUs to receive IRQs we 969 * can simply not tell the I/O APIC code about them in this function. 970 * We also do not tell it about the BSP since it tells itself about 971 * the BSP internally to work with UP kernels and on UP machines. 
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread)
			continue;

		intr_add_cpu(i);
	}
}


#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10mS */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue.  OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200uS */
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(other_cpus, ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bit for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

#ifdef DEV_ISA
int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}
#endif

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		wbinvd();
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we are resumed */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &started_cpus);
}


void
invlcache_handler(void)
{
	uint32_t generation;

#ifdef COUNT_IPIS
	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since wbinvd is a serializing instruction.  Without the
	 * temporary, we'd wait for wbinvd to complete, then the read
	 * would execute, then the dependent write, which must then
	 * complete before return from interrupt.
	 */
	generation = smp_tlb_generation;
	wbinvd();
	PCPU_SET(smp_tlb_done, generation);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif

/*
 * Flush the TLB on other CPUs
 */

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap;
volatile uint32_t smp_tlb_generation;

#ifdef __amd64__
#define	read_eflags() read_rflags()
#endif

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
    vm_offset_t addr1, vm_offset_t addr2)
{
	cpuset_t other_cpus;
	volatile uint32_t *p_cpudone;
	uint32_t generation;
	int cpu;

	/*
	 * Check for other cpus.  Return if none.
	 */
	if (CPU_ISFULLSET(&mask)) {
		if (mp_ncpus <= 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}

	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	smp_tlb_pmap = pmap;
	generation = ++smp_tlb_generation;
	if (CPU_ISFULLSET(&mask)) {
		ipi_all_but_self(vector);
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	} else {
		other_cpus = mask;
		while ((cpu = CPU_FFS(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
			    cpu, vector);
			ipi_send_cpu(cpu, vector);
		}
	}
	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &other_cpus);
		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
		while (*p_cpudone != generation)
			ia32_pause();
	}
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_global++;
#endif
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
#ifdef COUNT_XINVLTLB_HITS
		ipi_page++;
#endif
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
    pmap_t pmap)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
		    addr1, addr2);
#ifdef COUNT_XINVLTLB_HITS
		ipi_range++;
		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
#endif
	}
}

void
smp_cache_flush(void)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
		    0, 0);
	}
}

/*
 * Handlers for TLB related IPIs
 */
void
invltlb_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	/*
	 * Reading the generation here allows greater parallelism
	 * since invalidating the TLB is a serializing operation.
	 */
	generation = smp_tlb_generation;
	if (smp_tlb_pmap == kernel_pmap)
		invltlb_glob();
	else
		invltlb();
	PCPU_SET(smp_tlb_done, generation);
}

void
invlpg_handler(void)
{
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_pg[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	generation = smp_tlb_generation;	/* Overlap with serialization */
	invlpg(smp_tlb_addr1);
	PCPU_SET(smp_tlb_done, generation);
}

void
invlrng_handler(void)
{
	vm_offset_t addr, addr2;
	uint32_t generation;

#ifdef COUNT_XINVLTLB_HITS
	xhits_rng[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
#ifdef COUNT_IPIS
	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	addr = smp_tlb_addr1;
	addr2 = smp_tlb_addr2;
	generation = smp_tlb_generation;	/* Overlap with serialization */
	do {
		invlpg(addr);
		addr += PAGE_SIZE;
	} while (addr < addr2);

	PCPU_SET(smp_tlb_done, generation);
}