mp_x86.c revision 297577
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: head/sys/x86/x86/mp_x86.c 297577 2016-04-05 10:36:40Z avg $"); 29 30#ifdef __i386__ 31#include "opt_apic.h" 32#endif 33#include "opt_cpu.h" 34#include "opt_kstack_pages.h" 35#include "opt_pmap.h" 36#include "opt_sched.h" 37#include "opt_smp.h" 38 39#include <sys/param.h> 40#include <sys/systm.h> 41#include <sys/bus.h> 42#include <sys/cons.h> /* cngetc() */ 43#include <sys/cpuset.h> 44#ifdef GPROF 45#include <sys/gmon.h> 46#endif 47#include <sys/kernel.h> 48#include <sys/ktr.h> 49#include <sys/lock.h> 50#include <sys/malloc.h> 51#include <sys/memrange.h> 52#include <sys/mutex.h> 53#include <sys/pcpu.h> 54#include <sys/proc.h> 55#include <sys/sched.h> 56#include <sys/smp.h> 57#include <sys/sysctl.h> 58 59#include <vm/vm.h> 60#include <vm/vm_param.h> 61#include <vm/pmap.h> 62#include <vm/vm_kern.h> 63#include <vm/vm_extern.h> 64 65#include <x86/apicreg.h> 66#include <machine/clock.h> 67#include <machine/cputypes.h> 68#include <x86/mca.h> 69#include <machine/md_var.h> 70#include <machine/pcb.h> 71#include <machine/psl.h> 72#include <machine/smp.h> 73#include <machine/specialreg.h> 74#include <machine/cpu.h> 75 76#define WARMBOOT_TARGET 0 77#define WARMBOOT_OFF (KERNBASE + 0x0467) 78#define WARMBOOT_SEG (KERNBASE + 0x0469) 79 80#define CMOS_REG (0x70) 81#define CMOS_DATA (0x71) 82#define BIOS_RESET (0x0f) 83#define BIOS_WARM (0x0a) 84 85/* lock region used by kernel profiling */ 86int mcount_lock; 87 88int mp_naps; /* # of Applications processors */ 89int boot_cpu_id = -1; /* designated BSP */ 90 91extern struct pcpu __pcpu[]; 92 93/* AP uses this during bootstrap. Do not staticize. */ 94char *bootSTK; 95int bootAP; 96 97/* Free these after use */ 98void *bootstacks[MAXCPU]; 99void *dpcpu; 100 101struct pcb stoppcbs[MAXCPU]; 102struct susppcb **susppcbs; 103 104#ifdef COUNT_IPIS 105/* Interrupt counts. 
 */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

/* used to hold the AP's until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int apic_cpuids[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

static void release_aps(void *dummy);

static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package\n", level);
		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core\n", level);
		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 * - AMD CPUID Specification (Publication # 25481)
 * - BKDG For AMD Family 10h Processors (Publication # 31116), section 2.15
 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 * XXX At the moment the code does not recognize grouping of AMD CMT threads,
 * if supported, into cores, so each thread is treated as being in its own
 * core.  In other words, each logical CPU is considered to be a core.
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {

				/*
				 * TODO: Account for dual-node processors
				 * where each node within a package has its own
				 * L3 cache.
				 */
				caches[2].id_shift = pkg_id_shift;
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *   FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
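	/*
	 * The Intel SDM documents leaf 0xb as unimplemented when sub-leaf 0
	 * returns EBX == 0, even if cpu_high reports the leaf as available,
	 * hence the explicit check below.
	 */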
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *   Volume 2A: Instruction Set Reference, A-M,
 *   CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache levels and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 * - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that the 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that the APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
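 * For example, with pkg_id_shift == 4 and core_id_shift == 1, APIC ID 0x13
 * would be interpreted as package 1, core 1 within that package and hardware
 * thread 1 within that core, and the same decomposition is assumed to hold
 * for every other APIC ID in the system.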
448 */ 449void 450topo_probe(void) 451{ 452 static int cpu_topo_probed = 0; 453 struct x86_topo_layer { 454 int type; 455 int subtype; 456 int id_shift; 457 } topo_layers[MAX_CACHE_LEVELS + 3]; 458 struct topo_node *parent; 459 struct topo_node *node; 460 int layer; 461 int nlayers; 462 int node_id; 463 int i; 464 465 if (cpu_topo_probed) 466 return; 467 468 CPU_ZERO(&logical_cpus_mask); 469 470 if (mp_ncpus <= 1) 471 ; /* nothing */ 472 else if (cpu_vendor_id == CPU_VENDOR_AMD) 473 topo_probe_amd(); 474 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 475 topo_probe_intel(); 476 477 KASSERT(pkg_id_shift >= core_id_shift, 478 ("bug in APIC topology discovery")); 479 480 nlayers = 0; 481 bzero(topo_layers, sizeof(topo_layers)); 482 483 topo_layers[nlayers].type = TOPO_TYPE_PKG; 484 topo_layers[nlayers].id_shift = pkg_id_shift; 485 if (bootverbose) 486 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 487 nlayers++; 488 489 /* 490 * Consider all caches to be within a package/chip 491 * and "in front" of all sub-components like 492 * cores and hardware threads. 493 */ 494 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 495 if (caches[i].present) { 496 KASSERT(caches[i].id_shift <= pkg_id_shift, 497 ("bug in APIC topology discovery")); 498 KASSERT(caches[i].id_shift >= core_id_shift, 499 ("bug in APIC topology discovery")); 500 501 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 502 topo_layers[nlayers].subtype = i + 1; 503 topo_layers[nlayers].id_shift = caches[i].id_shift; 504 if (bootverbose) 505 printf("L%u cache ID shift: %u\n", 506 topo_layers[nlayers].subtype, 507 topo_layers[nlayers].id_shift); 508 nlayers++; 509 } 510 } 511 512 if (pkg_id_shift > core_id_shift) { 513 topo_layers[nlayers].type = TOPO_TYPE_CORE; 514 topo_layers[nlayers].id_shift = core_id_shift; 515 if (bootverbose) 516 printf("Core ID shift: %u\n", 517 topo_layers[nlayers].id_shift); 518 nlayers++; 519 } 520 521 topo_layers[nlayers].type = TOPO_TYPE_PU; 522 topo_layers[nlayers].id_shift = 0; 523 nlayers++; 524 525 topo_init_root(&topo_root); 526 for (i = 0; i <= MAX_APIC_ID; ++i) { 527 if (!cpu_info[i].cpu_present) 528 continue; 529 530 parent = &topo_root; 531 for (layer = 0; layer < nlayers; ++layer) { 532 node_id = i >> topo_layers[layer].id_shift; 533 parent = topo_add_node_by_hwid(parent, node_id, 534 topo_layers[layer].type, 535 topo_layers[layer].subtype); 536 } 537 } 538 539 parent = &topo_root; 540 for (layer = 0; layer < nlayers; ++layer) { 541 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 542 node = topo_find_node_by_hwid(parent, node_id, 543 topo_layers[layer].type, 544 topo_layers[layer].subtype); 545 topo_promote_child(node); 546 parent = node; 547 } 548 549 cpu_topo_probed = 1; 550} 551 552/* 553 * Assign logical CPU IDs to local APICs. 554 */ 555void 556assign_cpu_ids(void) 557{ 558 struct topo_node *node; 559 u_int smt_mask; 560 561 smt_mask = (1u << core_id_shift) - 1; 562 563 /* 564 * Assign CPU IDs to local APIC IDs and disable any CPUs 565 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 
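 * A PU whose SMT bits (the APIC ID bits below core_id_shift) differ from
 * the BSP's is flagged as a hyperthread; since the BSP is normally the
 * first hardware thread of its core, this marks every secondary thread.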
566 */ 567 mp_ncpus = 0; 568 TOPO_FOREACH(node, &topo_root) { 569 if (node->type != TOPO_TYPE_PU) 570 continue; 571 572 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 573 cpu_info[node->hwid].cpu_hyperthread = 1; 574 575 if (resource_disabled("lapic", node->hwid)) { 576 if (node->hwid != boot_cpu_id) 577 cpu_info[node->hwid].cpu_disabled = 1; 578 else 579 printf("Cannot disable BSP, APIC ID = %d\n", 580 node->hwid); 581 } 582 583 if (!hyperthreading_allowed && 584 cpu_info[node->hwid].cpu_hyperthread) 585 cpu_info[node->hwid].cpu_disabled = 1; 586 587 if (mp_ncpus >= MAXCPU) 588 cpu_info[node->hwid].cpu_disabled = 1; 589 590 if (cpu_info[node->hwid].cpu_disabled) { 591 disabled_cpus++; 592 continue; 593 } 594 595 cpu_apic_ids[mp_ncpus] = node->hwid; 596 apic_cpuids[node->hwid] = mp_ncpus; 597 topo_set_pu_id(node, mp_ncpus); 598 mp_ncpus++; 599 } 600 601 KASSERT(mp_maxid >= mp_ncpus - 1, 602 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 603 mp_ncpus)); 604} 605 606/* 607 * Print various information about the SMP system hardware and setup. 608 */ 609void 610cpu_mp_announce(void) 611{ 612 struct topo_node *node; 613 const char *hyperthread; 614 int pkg_count; 615 int cores_per_pkg; 616 int thrs_per_core; 617 618 printf("FreeBSD/SMP: "); 619 if (topo_analyze(&topo_root, 1, &pkg_count, 620 &cores_per_pkg, &thrs_per_core)) { 621 printf("%d package(s)", pkg_count); 622 if (cores_per_pkg > 0) 623 printf(" x %d core(s)", cores_per_pkg); 624 if (thrs_per_core > 1) 625 printf(" x %d hardware threads", thrs_per_core); 626 } else { 627 printf("Non-uniform topology"); 628 } 629 printf("\n"); 630 631 if (disabled_cpus) { 632 printf("FreeBSD/SMP Online: "); 633 if (topo_analyze(&topo_root, 0, &pkg_count, 634 &cores_per_pkg, &thrs_per_core)) { 635 printf("%d package(s)", pkg_count); 636 if (cores_per_pkg > 0) 637 printf(" x %d core(s)", cores_per_pkg); 638 if (thrs_per_core > 1) 639 printf(" x %d hardware threads", thrs_per_core); 640 } else { 641 printf("Non-uniform topology"); 642 } 643 printf("\n"); 644 } 645 646 if (!bootverbose) 647 return; 648 649 TOPO_FOREACH(node, &topo_root) { 650 switch (node->type) { 651 case TOPO_TYPE_PKG: 652 printf("Package HW ID = %u (%#x)\n", 653 node->hwid, node->hwid); 654 break; 655 case TOPO_TYPE_CORE: 656 printf("\tCore HW ID = %u (%#x)\n", 657 node->hwid, node->hwid); 658 break; 659 case TOPO_TYPE_PU: 660 if (cpu_info[node->hwid].cpu_hyperthread) 661 hyperthread = "/HT"; 662 else 663 hyperthread = ""; 664 665 if (node->subtype == 0) 666 printf("\t\tCPU (AP%s): APIC ID: %u (%#x)" 667 "(disabled)\n", hyperthread, node->hwid, 668 node->hwid); 669 else if (node->id == 0) 670 printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n", 671 node->hwid, node->hwid); 672 else 673 printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n", 674 node->id, hyperthread, node->hwid, 675 node->hwid); 676 break; 677 default: 678 /* ignored */ 679 break; 680 } 681 } 682} 683 684/* 685 * Add a scheduling group, a group of logical processors sharing 686 * a particular cache (and, thus having an affinity), to the scheduling 687 * topology. 688 * This function recursively works on lower level caches. 
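 * For example, on a typical two-package system with a package-wide L3 and
 * per-core L2 caches, the result is: system -> one group per L3 -> one
 * group per core-level L2, with CG_FLAG_SMT set on the per-core groups
 * when a core runs two hardware threads.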
689 */ 690static void 691x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 692{ 693 struct topo_node *node; 694 int nchildren; 695 int ncores; 696 int i; 697 698 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE, 699 ("x86topo_add_sched_group: bad type: %u", root->type)); 700 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 701 cg_root->cg_count = root->cpu_count; 702 if (root->type == TOPO_TYPE_SYSTEM) 703 cg_root->cg_level = CG_SHARE_NONE; 704 else 705 cg_root->cg_level = root->subtype; 706 707 /* 708 * Check how many core nodes we have under the given root node. 709 * If we have multiple logical processors, but not multiple 710 * cores, then those processors must be hardware threads. 711 */ 712 ncores = 0; 713 node = root; 714 while (node != NULL) { 715 if (node->type != TOPO_TYPE_CORE) { 716 node = topo_next_node(root, node); 717 continue; 718 } 719 720 ncores++; 721 node = topo_next_nonchild_node(root, node); 722 } 723 724 if (cg_root->cg_level != CG_SHARE_NONE && 725 root->cpu_count > 1 && ncores < 2) 726 cg_root->cg_flags = CG_FLAG_SMT; 727 728 /* 729 * Find out how many cache nodes we have under the given root node. 730 * We ignore cache nodes that cover all the same processors as the 731 * root node. Also, we do not descend below found cache nodes. 732 * That is, we count top-level "non-redundant" caches under the root 733 * node. 734 */ 735 nchildren = 0; 736 node = root; 737 while (node != NULL) { 738 if (node->type != TOPO_TYPE_CACHE || 739 (root->type != TOPO_TYPE_SYSTEM && 740 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 741 node = topo_next_node(root, node); 742 continue; 743 } 744 nchildren++; 745 node = topo_next_nonchild_node(root, node); 746 } 747 748 cg_root->cg_child = smp_topo_alloc(nchildren); 749 cg_root->cg_children = nchildren; 750 751 /* 752 * Now find again the same cache nodes as above and recursively 753 * build scheduling topologies for them. 754 */ 755 node = root; 756 i = 0; 757 while (node != NULL) { 758 if (node->type != TOPO_TYPE_CACHE || 759 (root->type != TOPO_TYPE_SYSTEM && 760 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { 761 node = topo_next_node(root, node); 762 continue; 763 } 764 cg_root->cg_child[i].cg_parent = cg_root; 765 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 766 i++; 767 node = topo_next_nonchild_node(root, node); 768 } 769} 770 771/* 772 * Build the MI scheduling topology from the discovered hardware topology. 773 */ 774struct cpu_group * 775cpu_topo(void) 776{ 777 struct cpu_group *cg_root; 778 779 if (mp_ncpus <= 1) 780 return (smp_topo_none()); 781 782 cg_root = smp_topo_alloc(1); 783 x86topo_add_sched_group(&topo_root, cg_root); 784 return (cg_root); 785} 786 787 788/* 789 * Add a logical CPU to the topology. 790 */ 791void 792cpu_add(u_int apic_id, char boot_cpu) 793{ 794 795 if (apic_id > MAX_APIC_ID) { 796 panic("SMP: APIC ID %d too high", apic_id); 797 return; 798 } 799 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 800 apic_id)); 801 cpu_info[apic_id].cpu_present = 1; 802 if (boot_cpu) { 803 KASSERT(boot_cpu_id == -1, 804 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 805 boot_cpu_id)); 806 boot_cpu_id = apic_id; 807 cpu_info[apic_id].cpu_bsp = 1; 808 } 809 if (mp_ncpus < MAXCPU) { 810 mp_ncpus++; 811 mp_maxid = mp_ncpus - 1; 812 } 813 if (bootverbose) 814 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? 
"BSP" : 815 "AP"); 816} 817 818void 819cpu_mp_setmaxid(void) 820{ 821 822 /* 823 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 824 * If there were no calls to cpu_add() assume this is a UP system. 825 */ 826 if (mp_ncpus == 0) 827 mp_ncpus = 1; 828} 829 830int 831cpu_mp_probe(void) 832{ 833 834 /* 835 * Always record BSP in CPU map so that the mbuf init code works 836 * correctly. 837 */ 838 CPU_SETOF(0, &all_cpus); 839 return (mp_ncpus > 1); 840} 841 842/* 843 * AP CPU's call this to initialize themselves. 844 */ 845void 846init_secondary_tail(void) 847{ 848 u_int cpuid; 849 850 /* 851 * On real hardware, switch to x2apic mode if possible. Do it 852 * after aps_ready was signalled, to avoid manipulating the 853 * mode while BSP might still want to send some IPI to us 854 * (second startup IPI is ignored on modern hardware etc). 855 */ 856 lapic_xapic_mode(); 857 858 /* Initialize the PAT MSR. */ 859 pmap_init_pat(); 860 861 /* set up CPU registers and state */ 862 cpu_setregs(); 863 864 /* set up SSE/NX */ 865 initializecpu(); 866 867 /* set up FPU state on the AP */ 868#ifdef __amd64__ 869 fpuinit(); 870#else 871 npxinit(false); 872#endif 873 874 if (cpu_ops.cpu_init) 875 cpu_ops.cpu_init(); 876 877 /* A quick check from sanity claus */ 878 cpuid = PCPU_GET(cpuid); 879 if (PCPU_GET(apic_id) != lapic_id()) { 880 printf("SMP: cpuid = %d\n", cpuid); 881 printf("SMP: actual apic_id = %d\n", lapic_id()); 882 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 883 panic("cpuid mismatch! boom!!"); 884 } 885 886 /* Initialize curthread. */ 887 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 888 PCPU_SET(curthread, PCPU_GET(idlethread)); 889 890 mca_init(); 891 892 mtx_lock_spin(&ap_boot_mtx); 893 894 /* Init local apic for irq's */ 895 lapic_setup(1); 896 897 /* Set memory range attributes for this CPU to match the BSP */ 898 mem_range_AP_init(); 899 900 smp_cpus++; 901 902 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 903 printf("SMP: AP CPU #%d Launched!\n", cpuid); 904 905 /* Determine if we are a logical CPU. */ 906 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 907 CPU_SET(cpuid, &logical_cpus_mask); 908 909 if (bootverbose) 910 lapic_dump("AP"); 911 912 if (smp_cpus == mp_ncpus) { 913 /* enable IPI's, tlb shootdown, freezes etc */ 914 atomic_store_rel_int(&smp_started, 1); 915 } 916 917#ifdef __amd64__ 918 /* 919 * Enable global pages TLB extension 920 * This also implicitly flushes the TLB 921 */ 922 load_cr4(rcr4() | CR4_PGE); 923 if (pmap_pcid_enabled) 924 load_cr4(rcr4() | CR4_PCIDE); 925 load_ds(_udatasel); 926 load_es(_udatasel); 927 load_fs(_ufssel); 928#endif 929 930 mtx_unlock_spin(&ap_boot_mtx); 931 932 /* Wait until all the AP's are up. */ 933 while (atomic_load_acq_int(&smp_started) == 0) 934 ia32_pause(); 935 936 /* Start per-CPU event timers. */ 937 cpu_initclocks_ap(); 938 939 sched_throw(NULL); 940 941 panic("scheduler returned us to %s", __func__); 942 /* NOTREACHED */ 943} 944 945/******************************************************************* 946 * local functions and data 947 */ 948 949/* 950 * We tell the I/O APIC code about all the CPUs we want to receive 951 * interrupts. If we don't want certain CPUs to receive IRQs we 952 * can simply not tell the I/O APIC code about them in this function. 953 * We also do not tell it about the BSP since it tells itself about 954 * the BSP internally to work with UP kernels and on UP machines. 
955 */ 956void 957set_interrupt_apic_ids(void) 958{ 959 u_int i, apic_id; 960 961 for (i = 0; i < MAXCPU; i++) { 962 apic_id = cpu_apic_ids[i]; 963 if (apic_id == -1) 964 continue; 965 if (cpu_info[apic_id].cpu_bsp) 966 continue; 967 if (cpu_info[apic_id].cpu_disabled) 968 continue; 969 970 /* Don't let hyperthreads service interrupts. */ 971 if (cpu_info[apic_id].cpu_hyperthread) 972 continue; 973 974 intr_add_cpu(i); 975 } 976} 977 978 979#ifdef COUNT_XINVLTLB_HITS 980u_int xhits_gbl[MAXCPU]; 981u_int xhits_pg[MAXCPU]; 982u_int xhits_rng[MAXCPU]; 983static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); 984SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 985 sizeof(xhits_gbl), "IU", ""); 986SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 987 sizeof(xhits_pg), "IU", ""); 988SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 989 sizeof(xhits_rng), "IU", ""); 990 991u_int ipi_global; 992u_int ipi_page; 993u_int ipi_range; 994u_int ipi_range_size; 995SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 996SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 997SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 998SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 999 0, ""); 1000#endif /* COUNT_XINVLTLB_HITS */ 1001 1002/* 1003 * Init and startup IPI. 1004 */ 1005void 1006ipi_startup(int apic_id, int vector) 1007{ 1008 1009 /* 1010 * This attempts to follow the algorithm described in the 1011 * Intel Multiprocessor Specification v1.4 in section B.4. 1012 * For each IPI, we allow the local APIC ~20us to deliver the 1013 * IPI. If that times out, we panic. 1014 */ 1015 1016 /* 1017 * first we do an INIT IPI: this INIT IPI might be run, resetting 1018 * and running the target CPU. OR this INIT IPI might be latched (P5 1019 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1020 * ignored. 1021 */ 1022 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1023 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1024 lapic_ipi_wait(100); 1025 1026 /* Explicitly deassert the INIT IPI. */ 1027 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1028 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1029 apic_id); 1030 1031 DELAY(10000); /* wait ~10mS */ 1032 1033 /* 1034 * next we do a STARTUP IPI: the previous INIT IPI might still be 1035 * latched, (P5 bug) this 1st STARTUP would then terminate 1036 * immediately, and the previously started INIT IPI would continue. OR 1037 * the previous INIT IPI has already run. and this STARTUP IPI will 1038 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1039 * will run. 1040 */ 1041 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1042 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1043 vector, apic_id); 1044 if (!lapic_ipi_wait(100)) 1045 panic("Failed to deliver first STARTUP IPI to APIC %d", 1046 apic_id); 1047 DELAY(200); /* wait ~200uS */ 1048 1049 /* 1050 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1051 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1052 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1053 * recognized after hardware RESET or INIT IPI. 
1054 */ 1055 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1056 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1057 vector, apic_id); 1058 if (!lapic_ipi_wait(100)) 1059 panic("Failed to deliver second STARTUP IPI to APIC %d", 1060 apic_id); 1061 1062 DELAY(200); /* wait ~200uS */ 1063} 1064 1065/* 1066 * Send an IPI to specified CPU handling the bitmap logic. 1067 */ 1068void 1069ipi_send_cpu(int cpu, u_int ipi) 1070{ 1071 u_int bitmap, old_pending, new_pending; 1072 1073 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); 1074 1075 if (IPI_IS_BITMAPED(ipi)) { 1076 bitmap = 1 << ipi; 1077 ipi = IPI_BITMAP_VECTOR; 1078 do { 1079 old_pending = cpu_ipi_pending[cpu]; 1080 new_pending = old_pending | bitmap; 1081 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1082 old_pending, new_pending)); 1083 if (old_pending) 1084 return; 1085 } 1086 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1087} 1088 1089void 1090ipi_bitmap_handler(struct trapframe frame) 1091{ 1092 struct trapframe *oldframe; 1093 struct thread *td; 1094 int cpu = PCPU_GET(cpuid); 1095 u_int ipi_bitmap; 1096 1097 critical_enter(); 1098 td = curthread; 1099 td->td_intr_nesting_level++; 1100 oldframe = td->td_intr_frame; 1101 td->td_intr_frame = &frame; 1102 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 1103 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1104#ifdef COUNT_IPIS 1105 (*ipi_preempt_counts[cpu])++; 1106#endif 1107 sched_preempt(td); 1108 } 1109 if (ipi_bitmap & (1 << IPI_AST)) { 1110#ifdef COUNT_IPIS 1111 (*ipi_ast_counts[cpu])++; 1112#endif 1113 /* Nothing to do for AST */ 1114 } 1115 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1116#ifdef COUNT_IPIS 1117 (*ipi_hardclock_counts[cpu])++; 1118#endif 1119 hardclockintr(); 1120 } 1121 td->td_intr_frame = oldframe; 1122 td->td_intr_nesting_level--; 1123 critical_exit(); 1124} 1125 1126/* 1127 * send an IPI to a set of cpus. 1128 */ 1129void 1130ipi_selected(cpuset_t cpus, u_int ipi) 1131{ 1132 int cpu; 1133 1134 /* 1135 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1136 * of help in order to understand what is the source. 1137 * Set the mask of receiving CPUs for this purpose. 1138 */ 1139 if (ipi == IPI_STOP_HARD) 1140 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1141 1142 while ((cpu = CPU_FFS(&cpus)) != 0) { 1143 cpu--; 1144 CPU_CLR(cpu, &cpus); 1145 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1146 ipi_send_cpu(cpu, ipi); 1147 } 1148} 1149 1150/* 1151 * send an IPI to a specific CPU. 1152 */ 1153void 1154ipi_cpu(int cpu, u_int ipi) 1155{ 1156 1157 /* 1158 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1159 * of help in order to understand what is the source. 1160 * Set the mask of receiving CPUs for this purpose. 1161 */ 1162 if (ipi == IPI_STOP_HARD) 1163 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1164 1165 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1166 ipi_send_cpu(cpu, ipi); 1167} 1168 1169/* 1170 * send an IPI to all CPUs EXCEPT myself 1171 */ 1172void 1173ipi_all_but_self(u_int ipi) 1174{ 1175 cpuset_t other_cpus; 1176 1177 other_cpus = all_cpus; 1178 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1179 if (IPI_IS_BITMAPED(ipi)) { 1180 ipi_selected(other_cpus, ipi); 1181 return; 1182 } 1183 1184 /* 1185 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1186 * of help in order to understand what is the source. 1187 * Set the mask of receiving CPUs for this purpose. 
1188 */ 1189 if (ipi == IPI_STOP_HARD) 1190 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1191 1192 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1193 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1194} 1195 1196int 1197ipi_nmi_handler(void) 1198{ 1199 u_int cpuid; 1200 1201 /* 1202 * As long as there is not a simple way to know about a NMI's 1203 * source, if the bitmask for the current CPU is present in 1204 * the global pending bitword an IPI_STOP_HARD has been issued 1205 * and should be handled. 1206 */ 1207 cpuid = PCPU_GET(cpuid); 1208 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1209 return (1); 1210 1211 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1212 cpustop_handler(); 1213 return (0); 1214} 1215 1216/* 1217 * Handle an IPI_STOP by saving our current context and spinning until we 1218 * are resumed. 1219 */ 1220void 1221cpustop_handler(void) 1222{ 1223 u_int cpu; 1224 1225 cpu = PCPU_GET(cpuid); 1226 1227 savectx(&stoppcbs[cpu]); 1228 1229 /* Indicate that we are stopped */ 1230 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1231 1232 /* Wait for restart */ 1233 while (!CPU_ISSET(cpu, &started_cpus)) 1234 ia32_pause(); 1235 1236 CPU_CLR_ATOMIC(cpu, &started_cpus); 1237 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1238 1239#if defined(__amd64__) && defined(DDB) 1240 amd64_db_resume_dbreg(); 1241#endif 1242 1243 if (cpu == 0 && cpustop_restartfunc != NULL) { 1244 cpustop_restartfunc(); 1245 cpustop_restartfunc = NULL; 1246 } 1247} 1248 1249/* 1250 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1251 * are resumed. 1252 */ 1253void 1254cpususpend_handler(void) 1255{ 1256 u_int cpu; 1257 1258 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1259 1260 cpu = PCPU_GET(cpuid); 1261 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1262#ifdef __amd64__ 1263 fpususpend(susppcbs[cpu]->sp_fpususpend); 1264#else 1265 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1266#endif 1267 wbinvd(); 1268 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1269 } else { 1270#ifdef __amd64__ 1271 fpuresume(susppcbs[cpu]->sp_fpususpend); 1272#else 1273 npxresume(susppcbs[cpu]->sp_fpususpend); 1274#endif 1275 pmap_init_pat(); 1276 initializecpu(); 1277 PCPU_SET(switchtime, 0); 1278 PCPU_SET(switchticks, ticks); 1279 1280 /* Indicate that we are resumed */ 1281 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1282 } 1283 1284 /* Wait for resume */ 1285 while (!CPU_ISSET(cpu, &started_cpus)) 1286 ia32_pause(); 1287 1288 if (cpu_ops.cpu_resume) 1289 cpu_ops.cpu_resume(); 1290#ifdef __amd64__ 1291 if (vmm_resume_p) 1292 vmm_resume_p(); 1293#endif 1294 1295 /* Resume MCA and local APIC */ 1296 lapic_xapic_mode(); 1297 mca_resume(); 1298 lapic_setup(0); 1299 1300 /* Indicate that we are resumed */ 1301 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1302 CPU_CLR_ATOMIC(cpu, &started_cpus); 1303} 1304 1305 1306void 1307invlcache_handler(void) 1308{ 1309#ifdef COUNT_IPIS 1310 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; 1311#endif /* COUNT_IPIS */ 1312 1313 wbinvd(); 1314 atomic_add_int(&smp_tlb_wait, 1); 1315} 1316 1317/* 1318 * This is called once the rest of the system is up and running and we're 1319 * ready to let the AP's out of the pen. 1320 */ 1321static void 1322release_aps(void *dummy __unused) 1323{ 1324 1325 if (mp_ncpus == 1) 1326 return; 1327 atomic_store_rel_int(&aps_ready, 1); 1328 while (smp_started == 0) 1329 ia32_pause(); 1330} 1331SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1332 1333#ifdef COUNT_IPIS 1334/* 1335 * Setup interrupt counters for IPI handlers. 
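 * With the COUNT_IPIS kernel option these per-CPU counters are registered
 * via intrcnt_add() and show up alongside device interrupt counts, e.g. in
 * vmstat -i output.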
1336 */ 1337static void 1338mp_ipi_intrcnt(void *dummy) 1339{ 1340 char buf[64]; 1341 int i; 1342 1343 CPU_FOREACH(i) { 1344 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1345 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1346 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1347 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1348 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1349 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1350 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1351 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1352 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1353 intrcnt_add(buf, &ipi_preempt_counts[i]); 1354 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1355 intrcnt_add(buf, &ipi_ast_counts[i]); 1356 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1357 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1358 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1359 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1360 } 1361} 1362SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1363#endif 1364 1365/* 1366 * Flush the TLB on other CPU's 1367 */ 1368 1369/* Variables needed for SMP tlb shootdown. */ 1370static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; 1371pmap_t smp_tlb_pmap; 1372volatile int smp_tlb_wait; 1373 1374#ifdef __amd64__ 1375#define read_eflags() read_rflags() 1376#endif 1377 1378static void 1379smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, 1380 vm_offset_t addr1, vm_offset_t addr2) 1381{ 1382 int cpu, ncpu, othercpus; 1383 1384 othercpus = mp_ncpus - 1; /* does not shootdown self */ 1385 1386 /* 1387 * Check for other cpus. Return if none. 1388 */ 1389 if (CPU_ISFULLSET(&mask)) { 1390 if (othercpus < 1) 1391 return; 1392 } else { 1393 CPU_CLR(PCPU_GET(cpuid), &mask); 1394 if (CPU_EMPTY(&mask)) 1395 return; 1396 } 1397 1398 if (!(read_eflags() & PSL_I)) 1399 panic("%s: interrupts disabled", __func__); 1400 mtx_lock_spin(&smp_ipi_mtx); 1401 smp_tlb_addr1 = addr1; 1402 smp_tlb_addr2 = addr2; 1403 smp_tlb_pmap = pmap; 1404 smp_tlb_wait = 0; 1405 if (CPU_ISFULLSET(&mask)) { 1406 ncpu = othercpus; 1407 ipi_all_but_self(vector); 1408 } else { 1409 ncpu = 0; 1410 while ((cpu = CPU_FFS(&mask)) != 0) { 1411 cpu--; 1412 CPU_CLR(cpu, &mask); 1413 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, 1414 cpu, vector); 1415 ipi_send_cpu(cpu, vector); 1416 ncpu++; 1417 } 1418 } 1419 while (smp_tlb_wait < ncpu) 1420 ia32_pause(); 1421 mtx_unlock_spin(&smp_ipi_mtx); 1422} 1423 1424void 1425smp_masked_invltlb(cpuset_t mask, pmap_t pmap) 1426{ 1427 1428 if (smp_started) { 1429 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); 1430#ifdef COUNT_XINVLTLB_HITS 1431 ipi_global++; 1432#endif 1433 } 1434} 1435 1436void 1437smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) 1438{ 1439 1440 if (smp_started) { 1441 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0); 1442#ifdef COUNT_XINVLTLB_HITS 1443 ipi_page++; 1444#endif 1445 } 1446} 1447 1448void 1449smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) 1450{ 1451 1452 if (smp_started) { 1453 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL, 1454 addr1, addr2); 1455#ifdef COUNT_XINVLTLB_HITS 1456 ipi_range++; 1457 ipi_range_size += (addr2 - addr1) / PAGE_SIZE; 1458#endif 1459 } 1460} 1461 1462void 1463smp_cache_flush(void) 1464{ 1465 1466 if (smp_started) { 1467 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 1468 0, 0); 1469 } 1470} 1471 1472/* 1473 * Handlers for TLB related IPIs 1474 */ 1475void 1476invltlb_handler(void) 1477{ 1478#ifdef COUNT_XINVLTLB_HITS 
1479 xhits_gbl[PCPU_GET(cpuid)]++; 1480#endif /* COUNT_XINVLTLB_HITS */ 1481#ifdef COUNT_IPIS 1482 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; 1483#endif /* COUNT_IPIS */ 1484 1485 if (smp_tlb_pmap == kernel_pmap) 1486 invltlb_glob(); 1487 else 1488 invltlb(); 1489 atomic_add_int(&smp_tlb_wait, 1); 1490} 1491 1492void 1493invlpg_handler(void) 1494{ 1495#ifdef COUNT_XINVLTLB_HITS 1496 xhits_pg[PCPU_GET(cpuid)]++; 1497#endif /* COUNT_XINVLTLB_HITS */ 1498#ifdef COUNT_IPIS 1499 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; 1500#endif /* COUNT_IPIS */ 1501 1502 invlpg(smp_tlb_addr1); 1503 atomic_add_int(&smp_tlb_wait, 1); 1504} 1505 1506void 1507invlrng_handler(void) 1508{ 1509 vm_offset_t addr; 1510 1511#ifdef COUNT_XINVLTLB_HITS 1512 xhits_rng[PCPU_GET(cpuid)]++; 1513#endif /* COUNT_XINVLTLB_HITS */ 1514#ifdef COUNT_IPIS 1515 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; 1516#endif /* COUNT_IPIS */ 1517 1518 addr = smp_tlb_addr1; 1519 do { 1520 invlpg(addr); 1521 addr += PAGE_SIZE; 1522 } while (addr < smp_tlb_addr2); 1523 1524 atomic_add_int(&smp_tlb_wait, 1); 1525} 1526