1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28#include "opt_acpi.h"
29#ifdef __i386__
30#include "opt_apic.h"
31#endif
32#include "opt_cpu.h"
33#include "opt_ddb.h"
34#include "opt_gdb.h"
35#include "opt_kstack_pages.h"
36#include "opt_pmap.h"
37#include "opt_sched.h"
38#include "opt_smp.h"
39#include "opt_stack.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/asan.h>
44#include <sys/bus.h>
45#include <sys/cons.h>	/* cngetc() */
46#include <sys/cpuset.h>
47#include <sys/csan.h>
48#include <sys/interrupt.h>
49#include <sys/kdb.h>
50#include <sys/kernel.h>
51#include <sys/ktr.h>
52#include <sys/lock.h>
53#include <sys/malloc.h>
54#include <sys/memrange.h>
55#include <sys/mutex.h>
56#include <sys/pcpu.h>
57#include <sys/proc.h>
58#include <sys/sched.h>
59#include <sys/smp.h>
60#include <sys/sysctl.h>
61
62#include <vm/vm.h>
63#include <vm/vm_param.h>
64#include <vm/pmap.h>
65#include <vm/vm_kern.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_map.h>
68
69#include <x86/apicreg.h>
70#include <machine/clock.h>
71#include <machine/cpu.h>
72#include <machine/cputypes.h>
73#include <x86/mca.h>
74#include <machine/md_var.h>
75#include <machine/pcb.h>
76#include <machine/psl.h>
77#include <machine/smp.h>
78#include <machine/specialreg.h>
79#include <machine/stack.h>
80#include <x86/ucode.h>
81
82#ifdef DEV_ACPI
83#include <contrib/dev/acpica/include/acpi.h>
84#include <dev/acpica/acpivar.h>
85#endif
86
87static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");
88
89int	mp_naps;		/* # of Application processors */
90int	boot_cpu_id = -1;	/* designated BSP */
91
92/* AP uses this during bootstrap.  Do not staticize.  */
93char *bootSTK;
94int bootAP;
95
96/* Free these after use */
97void *bootstacks[MAXCPU];
98void *dpcpu;
99
100struct susppcb **susppcbs;
101
102#ifdef COUNT_IPIS
103/* Interrupt counts. */
104static u_long *ipi_preempt_counts[MAXCPU];
105static u_long *ipi_ast_counts[MAXCPU];
106u_long *ipi_invltlb_counts[MAXCPU];
107u_long *ipi_invlrng_counts[MAXCPU];
108u_long *ipi_invlpg_counts[MAXCPU];
109u_long *ipi_invlcache_counts[MAXCPU];
110u_long *ipi_rendezvous_counts[MAXCPU];
111static u_long *ipi_hardclock_counts[MAXCPU];
112#endif
113
114/* Default cpu_ops implementation. */
115struct cpu_ops cpu_ops;
116
117/*
118 * Local data and functions.
119 */
120
121static volatile cpuset_t ipi_stop_nmi_pending;
122
123volatile cpuset_t resuming_cpus;
124volatile cpuset_t toresume_cpus;
125
126/* used to hold the APs until we are ready to release them */
127struct mtx ap_boot_mtx;
128
129/* Set to 1 once we're ready to let the APs out of the pen. */
130volatile int aps_ready = 0;
131
132/*
133 * Store data from cpu_add() until later in the boot when we actually setup
134 * the APs.
135 */
136struct cpu_info *cpu_info;
137int *apic_cpuids;
138int cpu_apic_ids[MAXCPU];
139_Static_assert(MAXCPU <= MAX_APIC_ID,
140    "MAXCPU cannot be larger than MAX_APIC_ID");
141_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
142    "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");
143
144static void	release_aps(void *dummy);
145static void	cpustop_handler_post(u_int cpu);
146
147static int	hyperthreading_allowed = 1;
148SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
149	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
150
151static int	hyperthreading_intr_allowed = 0;
152SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
153	&hyperthreading_intr_allowed, 0,
154	"Allow interrupts on HTT logical CPUs");
155
156static int	intr_apic_id_limit = -1;
157SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN,
158	&intr_apic_id_limit, 0,
159	"Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)");
160
161static struct topo_node topo_root;
162
163static int pkg_id_shift;
164static int node_id_shift;
165static int core_id_shift;
166static int disabled_cpus;
167
168struct cache_info {
169	int	id_shift;
170	int	present;
171} static caches[MAX_CACHE_LEVELS];
172
173static bool stop_mwait = false;
174SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
175    "Use MONITOR/MWAIT when stopping CPU, if available");
176
177void
178mem_range_AP_init(void)
179{
180
181	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
182		mem_range_softc.mr_op->initAP(&mem_range_softc);
183}
184
185/*
186 * Compute ceil(log2(x)).  Returns -1 if x is zero.
187 */
188static __inline int
189mask_width(u_int x)
190{
191
192	return (x == 0 ? -1 : fls(x - 1));
193}
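
/*
 * For illustration, a few sample values (using the fls(9) convention that
 * fls(0) == 0 and fls(n) is the index of the most significant set bit):
 *
 *	mask_width(1) == 0	(a single item needs no ID bits)
 *	mask_width(2) == 1
 *	mask_width(3) == 2	(rounded up to the next power of two)
 *	mask_width(4) == 2
 *	mask_width(6) == 3
 *
 * The result is used as a bit-field width when decomposing an APIC ID;
 * e.g., a hypothetical layout with 2 SMT bits and 3 core bits would be
 * unpacked as:
 *
 *	core = (apic_id >> 2) & ((1 << 3) - 1);
 *	pkg  = apic_id >> (2 + 3);
 */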
194
195/*
196 * Add a cache level to the cache topology description.
197 */
198static int
199add_deterministic_cache(int type, int level, int share_count)
200{
201
202	if (type == 0)
203		return (0);
204	if (type > 3) {
205		printf("unexpected cache type %d\n", type);
206		return (1);
207	}
208	if (type == 2) /* ignore instruction cache */
209		return (1);
210	if (level == 0 || level > MAX_CACHE_LEVELS) {
211		printf("unexpected cache level %d\n", level);
212		return (1);
213	}
214
215	if (caches[level - 1].present) {
216		printf("WARNING: multiple entries for L%u data cache\n", level);
217		printf("%u => %u\n", caches[level - 1].id_shift,
218		    mask_width(share_count));
219	}
220	caches[level - 1].id_shift = mask_width(share_count);
221	caches[level - 1].present = 1;
222
223	if (caches[level - 1].id_shift > pkg_id_shift) {
224		printf("WARNING: L%u data cache covers more "
225		    "APIC IDs than a package (%u > %u)\n", level,
226		    caches[level - 1].id_shift, pkg_id_shift);
227		caches[level - 1].id_shift = pkg_id_shift;
228	}
229	if (caches[level - 1].id_shift < core_id_shift) {
230		printf("WARNING: L%u data cache covers fewer "
231		    "APIC IDs than a core (%u < %u)\n", level,
232		    caches[level - 1].id_shift, core_id_shift);
233		caches[level - 1].id_shift = core_id_shift;
234	}
235
236	return (1);
237}
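
/*
 * As a hypothetical example of the result: a CPU whose L1d is private to
 * each 2-way SMT core, whose L2 is shared by pairs of cores and whose L3
 * is shared by a whole 8-core package would end up with
 *
 *	caches[0].id_shift == 1		(L1: 2 APIC IDs per cache)
 *	caches[1].id_shift == 2		(L2: 4 APIC IDs per cache)
 *	caches[2].id_shift == 4		(L3: 16 APIC IDs per cache)
 *
 * i.e. logical CPUs whose APIC IDs agree in all bits at or above a
 * cache's id_shift share that cache.
 */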
238
239/*
240 * Determine topology of processing units and caches for AMD CPUs.
241 * See:
242 *  - AMD CPUID Specification (Publication # 25481)
243 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
244 *  - BKDG For AMD Family 10h Processors (Publication # 31116)
245 *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
246 *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
247 *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
248 */
249static void
250topo_probe_amd(void)
251{
252	u_int p[4];
253	uint64_t v;
254	int level;
255	int nodes_per_socket;
256	int share_count;
257	int type;
258	int i;
259
260	/* No multi-core capability. */
261	if ((amd_feature2 & AMDID2_CMP) == 0)
262		return;
263
264	/*
265	 * XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above
266	 * xAPIC_MAX_APIC_ID.  This is a workaround so we boot and function on
267	 * AMD systems with high thread counts, albeit with reduced interrupt
268	 * performance.
269	 *
270	 * We should really set the limit to xAPIC_MAX_APIC_ID by default, and
271	 * have the IOMMU driver increase it.  That way if a driver is present
272	 * but disabled, or is otherwise not able to route the interrupts, the
273	 * system can fall back to a functional state.  That will require a more
274	 * substantial change though, including having the IOMMU initialize
275	 * earlier.
276	 */
277	if (intr_apic_id_limit == -1)
278		intr_apic_id_limit = xAPIC_MAX_APIC_ID;
279
280	/* For families 10h and newer. */
281	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
282	    AMDID_COREID_SIZE_SHIFT;
283
284	/* For 0Fh family. */
285	if (pkg_id_shift == 0)
286		pkg_id_shift =
287		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
288
289	/*
290	 * Families prior to 16h define the following value as
291	 * cores per compute unit and we don't really care about the AMD
292	 * compute units at the moment.  Perhaps we should treat them as
293	 * cores and cores within the compute units as hardware threads,
294	 * but that's up for debate.
295	 * Later families define the value as threads per compute unit,
296	 * so we are following AMD's nomenclature here.
297	 */
298	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
299	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
300		cpuid_count(0x8000001e, 0, p);
301		share_count = ((p[1] >> 8) & 0xff) + 1;
302		core_id_shift = mask_width(share_count);
303
304		/*
305		 * For Zen (17h), gather Nodes per Processor.  Each node is a
306		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
307		 * package.  Communication latency between dies is higher than
308		 * within them.
309		 */
310		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
311		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
312	}
313
314	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
315		for (i = 0; ; i++) {
316			cpuid_count(0x8000001d, i, p);
317			type = p[0] & 0x1f;
318			level = (p[0] >> 5) & 0x7;
319			share_count = 1 + ((p[0] >> 14) & 0xfff);
320
321			if (!add_deterministic_cache(type, level, share_count))
322				break;
323		}
324	} else {
325		if (cpu_exthigh >= 0x80000005) {
326			cpuid_count(0x80000005, 0, p);
327			if (((p[2] >> 24) & 0xff) != 0) {
328				caches[0].id_shift = 0;
329				caches[0].present = 1;
330			}
331		}
332		if (cpu_exthigh >= 0x80000006) {
333			cpuid_count(0x80000006, 0, p);
334			if (((p[2] >> 16) & 0xffff) != 0) {
335				caches[1].id_shift = 0;
336				caches[1].present = 1;
337			}
338			if (((p[3] >> 18) & 0x3fff) != 0) {
339				nodes_per_socket = 1;
340				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
341					/*
342					 * Handle multi-node processors that
343					 * have multiple chips, each with its
344					 * own L3 cache, on the same die.
345					 */
346					v = rdmsr(0xc001100c);
347					nodes_per_socket = 1 + ((v >> 3) & 0x7);
348				}
349				caches[2].id_shift =
350				    pkg_id_shift - mask_width(nodes_per_socket);
351				caches[2].present = 1;
352			}
353		}
354	}
355}
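
/*
 * A rough worked example for the AMDID2_TOPOLOGY path above, using
 * hypothetical register values: if CPUID leaf 0x8000001e reports
 * EBX[15:8] == 1 (share_count == 2) and ECX[10:8] == 1 (2 nodes per
 * socket), and the AMDID_COREID_SIZE probe gave pkg_id_shift == 5, then
 *
 *	core_id_shift = mask_width(2) = 1;
 *	node_id_shift = 5 - mask_width(2) = 4;
 *
 * so APIC ID bit 0 selects the SMT thread, bits 1..3 the core within a
 * node, bit 4 the node, and bits 5 and up the package.
 */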
356
357/*
358 * Determine topology of processing units for Intel CPUs
359 * using CPUID Leaf 1 and Leaf 4, if supported.
360 * See:
361 *  - Intel 64 Architecture Processor Topology Enumeration
362 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
363 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
364 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
365 */
366static void
367topo_probe_intel_0x4(void)
368{
369	u_int p[4];
370	int max_cores;
371	int max_logical;
372
373	/* Both zero and one here mean one logical processor per package. */
374	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
375	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
376	if (max_logical <= 1)
377		return;
378
379	if (cpu_high >= 0x4) {
380		cpuid_count(0x04, 0, p);
381		max_cores = ((p[0] >> 26) & 0x3f) + 1;
382	} else
383		max_cores = 1;
384
385	core_id_shift = mask_width(max_logical/max_cores);
386	KASSERT(core_id_shift >= 0,
387	    ("intel topo: max_cores > max_logical\n"));
388	pkg_id_shift = core_id_shift + mask_width(max_cores);
389}
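
/*
 * For illustration, a hypothetical pre-0Bh part reporting
 * CPUID.1:EBX[23:16] == 8 (max_logical) and CPUID.4:EAX[31:26] + 1 == 4
 * (max_cores) yields
 *
 *	core_id_shift = mask_width(8 / 4) = 1;
 *	pkg_id_shift  = 1 + mask_width(4) = 3;
 *
 * i.e. APIC ID bit 0 distinguishes HTT siblings, bits 1-2 select the
 * core and bits 3 and up select the package.
 */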
390
391/*
392 * Determine topology of processing units for Intel CPUs
393 * using CPUID Leaf 1Fh or 0Bh, if supported.
394 * See:
395 *  - Intel 64 Architecture Processor Topology Enumeration
396 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
397 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
398 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
399 */
400static void
401topo_probe_intel_0xb(void)
402{
403	u_int leaf;
404	u_int p[4] = { 0 };
405	int bits;
406	int type;
407	int i;
408
409	/* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */
410	if (cpu_high >= 0x1f) {
411		leaf = 0x1f;
412		cpuid_count(leaf, 0, p);
413	}
414	/* Fall back to leaf 0Bh (Extended Topology Enumeration). */
415	if (p[1] == 0) {
416		leaf = 0x0b;
417		cpuid_count(leaf, 0, p);
418	}
419	/* Fall back to leaf 04h (Deterministic Cache Parameters). */
420	if (p[1] == 0) {
421		topo_probe_intel_0x4();
422		return;
423	}
424
425	/* We only support three levels for now. */
426	for (i = 0; ; i++) {
427		cpuid_count(leaf, i, p);
428
429		bits = p[0] & 0x1f;
430		type = (p[2] >> 8) & 0xff;
431
432		if (type == 0)
433			break;
434
435		if (type == CPUID_TYPE_SMT)
436			core_id_shift = bits;
437		else if (type == CPUID_TYPE_CORE)
438			pkg_id_shift = bits;
439		else if (bootverbose)
440			printf("Topology level type %d shift: %d\n", type, bits);
441	}
442
443	if (pkg_id_shift < core_id_shift) {
444		printf("WARNING: core covers more APIC IDs than a package\n");
445		core_id_shift = pkg_id_shift;
446	}
447}
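
/*
 * A sketch of what leaf 0Bh/1Fh enumeration produces on a hypothetical
 * 2-way SMT, 10-core package: the SMT sub-leaf reports EAX[4:0] == 1 and
 * the core sub-leaf reports EAX[4:0] == 5, so core_id_shift == 1 and
 * pkg_id_shift == 5.  A logical CPU's position can then be recovered
 * purely from its APIC ID:
 *
 *	smt  = apic_id & ((1 << core_id_shift) - 1);
 *	core = (apic_id >> core_id_shift) &
 *	    ((1 << (pkg_id_shift - core_id_shift)) - 1);
 *	pkg  = apic_id >> pkg_id_shift;
 */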
448
449/*
450 * Determine topology of caches for Intel CPUs.
451 * See:
452 *  - Intel 64 Architecture Processor Topology Enumeration
453 *  - Intel 64 and IA-32 Architectures Software Developer's Manual
454 *    Volume 2A: Instruction Set Reference, A-M,
455 *    CPUID instruction
456 */
457static void
458topo_probe_intel_caches(void)
459{
460	u_int p[4];
461	int level;
462	int share_count;
463	int type;
464	int i;
465
466	if (cpu_high < 0x4) {
467		/*
468		 * Available cache level and sizes can be determined
469		 * via CPUID leaf 2, but that requires a huge table of hardcoded
470		 * values, so for now just assume L1 and L2 caches potentially
471		 * shared only by HTT processing units, if HTT is present.
472		 */
473		caches[0].id_shift = pkg_id_shift;
474		caches[0].present = 1;
475		caches[1].id_shift = pkg_id_shift;
476		caches[1].present = 1;
477		return;
478	}
479
480	for (i = 0; ; i++) {
481		cpuid_count(0x4, i, p);
482		type = p[0] & 0x1f;
483		level = (p[0] >> 5) & 0x7;
484		share_count = 1 + ((p[0] >> 14) & 0xfff);
485
486		if (!add_deterministic_cache(type, level, share_count))
487			break;
488	}
489}
490
491/*
492 * Determine topology of processing units and caches for Intel CPUs.
493 * See:
494 *  - Intel 64 Architecture Processor Topology Enumeration
495 */
496static void
497topo_probe_intel(void)
498{
499
500	/*
501	 * Note that the 0x1 <= cpu_high < 4 case is expected to be
502	 * compatible with the topo_probe_intel_0x4() logic when
503	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1), and to
504	 * trigger the fallback otherwise.
505	 */
506	if (cpu_high >= 0xb)
507		topo_probe_intel_0xb();
508	else if (cpu_high >= 0x1)
509		topo_probe_intel_0x4();
510
511	topo_probe_intel_caches();
512}
513
514/*
515 * Topology information is queried only on the BSP, on which this
516 * code runs and for which it can query CPUID information.
517 * The topology is then extrapolated to all packages under the
518 * assumption that the APIC ID to hardware component ID mapping is
519 * homogeneous.
520 * That does not necessarily imply that the topology is uniform.
521 */
522void
523topo_probe(void)
524{
525	static int cpu_topo_probed = 0;
526	struct x86_topo_layer {
527		int type;
528		int subtype;
529		int id_shift;
530	} topo_layers[MAX_CACHE_LEVELS + 5];
531	struct topo_node *parent;
532	struct topo_node *node;
533	int layer;
534	int nlayers;
535	int node_id;
536	int i;
537#if defined(DEV_ACPI) && MAXMEMDOM > 1
538	int d, domain;
539#endif
540
541	if (cpu_topo_probed)
542		return;
543
544	CPU_ZERO(&logical_cpus_mask);
545
546	if (mp_ncpus <= 1)
547		; /* nothing */
548	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
549	    cpu_vendor_id == CPU_VENDOR_HYGON)
550		topo_probe_amd();
551	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
552		topo_probe_intel();
553
554	KASSERT(pkg_id_shift >= core_id_shift,
555	    ("bug in APIC topology discovery"));
556
557	nlayers = 0;
558	bzero(topo_layers, sizeof(topo_layers));
559
560	topo_layers[nlayers].type = TOPO_TYPE_PKG;
561	topo_layers[nlayers].id_shift = pkg_id_shift;
562	if (bootverbose)
563		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
564	nlayers++;
565
566	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
567		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
568		topo_layers[nlayers].id_shift = node_id_shift;
569		if (bootverbose)
570			printf("Node ID shift: %u\n",
571			    topo_layers[nlayers].id_shift);
572		nlayers++;
573	}
574
575	/*
576	 * Consider all caches to be within a package/chip
577	 * and "in front" of all sub-components like
578	 * cores and hardware threads.
579	 */
580	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
581		if (caches[i].present) {
582			if (node_id_shift != 0)
583				KASSERT(caches[i].id_shift <= node_id_shift,
584					("bug in APIC topology discovery"));
585			KASSERT(caches[i].id_shift <= pkg_id_shift,
586				("bug in APIC topology discovery"));
587			KASSERT(caches[i].id_shift >= core_id_shift,
588				("bug in APIC topology discovery"));
589
590			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
591			topo_layers[nlayers].subtype = i + 1;
592			topo_layers[nlayers].id_shift = caches[i].id_shift;
593			if (bootverbose)
594				printf("L%u cache ID shift: %u\n",
595				    topo_layers[nlayers].subtype,
596				    topo_layers[nlayers].id_shift);
597			nlayers++;
598		}
599	}
600
601	if (pkg_id_shift > core_id_shift) {
602		topo_layers[nlayers].type = TOPO_TYPE_CORE;
603		topo_layers[nlayers].id_shift = core_id_shift;
604		if (bootverbose)
605			printf("Core ID shift: %u\n",
606			    topo_layers[nlayers].id_shift);
607		nlayers++;
608	}
609
610	topo_layers[nlayers].type = TOPO_TYPE_PU;
611	topo_layers[nlayers].id_shift = 0;
612	nlayers++;
613
614#if defined(DEV_ACPI) && MAXMEMDOM > 1
615	if (vm_ndomains > 1) {
616		for (layer = 0; layer < nlayers; ++layer) {
617			for (i = 0; i <= max_apic_id; ++i) {
618				if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
619					domain = -1;
620				if (!cpu_info[i].cpu_present)
621					continue;
622				d = acpi_pxm_get_cpu_locality(i);
623				if (domain >= 0 && domain != d)
624					break;
625				domain = d;
626			}
627			if (i > max_apic_id)
628				break;
629		}
630		KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
631		memmove(&topo_layers[layer+1], &topo_layers[layer],
632		    sizeof(*topo_layers) * (nlayers - layer));
633		topo_layers[layer].type = TOPO_TYPE_NODE;
634		topo_layers[layer].subtype = CG_SHARE_NONE;
635		nlayers++;
636	}
637#endif
638
639	topo_init_root(&topo_root);
640	for (i = 0; i <= max_apic_id; ++i) {
641		if (!cpu_info[i].cpu_present)
642			continue;
643
644		parent = &topo_root;
645		for (layer = 0; layer < nlayers; ++layer) {
646#if defined(DEV_ACPI) && MAXMEMDOM > 1
647			if (topo_layers[layer].type == TOPO_TYPE_NODE) {
648				node_id = acpi_pxm_get_cpu_locality(i);
649			} else
650#endif
651				node_id = i >> topo_layers[layer].id_shift;
652			parent = topo_add_node_by_hwid(parent, node_id,
653			    topo_layers[layer].type,
654			    topo_layers[layer].subtype);
655		}
656	}
657
658	parent = &topo_root;
659	for (layer = 0; layer < nlayers; ++layer) {
660#if defined(DEV_ACPI) && MAXMEMDOM > 1
661		if (topo_layers[layer].type == TOPO_TYPE_NODE)
662			node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
663		else
664#endif
665			node_id = boot_cpu_id >> topo_layers[layer].id_shift;
666		node = topo_find_node_by_hwid(parent, node_id,
667		    topo_layers[layer].type,
668		    topo_layers[layer].subtype);
669		topo_promote_child(node);
670		parent = node;
671	}
672
673	cpu_topo_probed = 1;
674}
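
/*
 * As an example of the result (ignoring L1/L2 for brevity), a
 * hypothetical system with two packages, a package-wide L3, two cores
 * per package and 2-way SMT produces the layer stack
 * PKG (shift 2) -> CACHE/L3 (shift 2) -> CORE (shift 1) -> PU (shift 0),
 * and the tree rooted at topo_root looks roughly like:
 *
 *	SYSTEM
 *	  PKG 0
 *	    L3
 *	      CORE 0: PU 0, PU 1
 *	      CORE 1: PU 2, PU 3
 *	  PKG 1
 *	    ...
 */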
675
676/*
677 * Assign logical CPU IDs to local APICs.
678 */
679void
680assign_cpu_ids(void)
681{
682	struct topo_node *node;
683	u_int smt_mask;
684	int nhyper;
685
686	smt_mask = (1u << core_id_shift) - 1;
687
688	/*
689	 * Assign CPU IDs to local APIC IDs and disable any CPUs
690	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
691	 */
692	mp_ncpus = 0;
693	nhyper = 0;
694	TOPO_FOREACH(node, &topo_root) {
695		if (node->type != TOPO_TYPE_PU)
696			continue;
697
698		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
699			cpu_info[node->hwid].cpu_hyperthread = 1;
700
701		if (resource_disabled("lapic", node->hwid)) {
702			if (node->hwid != boot_cpu_id)
703				cpu_info[node->hwid].cpu_disabled = 1;
704			else
705				printf("Cannot disable BSP, APIC ID = %d\n",
706				    node->hwid);
707		}
708
709		if (!hyperthreading_allowed &&
710		    cpu_info[node->hwid].cpu_hyperthread)
711			cpu_info[node->hwid].cpu_disabled = 1;
712
713		if (mp_ncpus >= MAXCPU)
714			cpu_info[node->hwid].cpu_disabled = 1;
715
716		if (cpu_info[node->hwid].cpu_disabled) {
717			disabled_cpus++;
718			continue;
719		}
720
721		if (cpu_info[node->hwid].cpu_hyperthread)
722			nhyper++;
723
724		cpu_apic_ids[mp_ncpus] = node->hwid;
725		apic_cpuids[node->hwid] = mp_ncpus;
726		topo_set_pu_id(node, mp_ncpus);
727		mp_ncpus++;
728	}
729
730	KASSERT(mp_maxid >= mp_ncpus - 1,
731	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
732	    mp_ncpus));
733
734	mp_ncores = mp_ncpus - nhyper;
735	smp_threads_per_core = mp_ncpus / mp_ncores;
736}
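
/*
 * For example, on a hypothetical 4-core, 2-way SMT package with nothing
 * disabled, the loop above ends up with mp_ncpus == 8 and nhyper == 4
 * (in each core, the PU whose SMT bits match the BSP's is not counted as
 * a hyperthread, the other one is), so
 *
 *	mp_ncores = 8 - 4 = 4;
 *	smp_threads_per_core = 8 / 4 = 2;
 */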
737
738/*
739 * Print various information about the SMP system hardware and setup.
740 */
741void
742cpu_mp_announce(void)
743{
744	struct topo_node *node;
745	const char *hyperthread;
746	struct topo_analysis topology;
747
748	printf("FreeBSD/SMP: ");
749	if (topo_analyze(&topo_root, 1, &topology)) {
750		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
751		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
752			printf(" x %d groups",
753			    topology.entities[TOPO_LEVEL_GROUP]);
754		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
755			printf(" x %d cache groups",
756			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
757		if (topology.entities[TOPO_LEVEL_CORE] > 0)
758			printf(" x %d core(s)",
759			    topology.entities[TOPO_LEVEL_CORE]);
760		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
761			printf(" x %d hardware threads",
762			    topology.entities[TOPO_LEVEL_THREAD]);
763	} else {
764		printf("Non-uniform topology");
765	}
766	printf("\n");
767
768	if (disabled_cpus) {
769		printf("FreeBSD/SMP Online: ");
770		if (topo_analyze(&topo_root, 0, &topology)) {
771			printf("%d package(s)",
772			    topology.entities[TOPO_LEVEL_PKG]);
773			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
774				printf(" x %d groups",
775				    topology.entities[TOPO_LEVEL_GROUP]);
776			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
777				printf(" x %d cache groups",
778				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
779			if (topology.entities[TOPO_LEVEL_CORE] > 0)
780				printf(" x %d core(s)",
781				    topology.entities[TOPO_LEVEL_CORE]);
782			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
783				printf(" x %d hardware threads",
784				    topology.entities[TOPO_LEVEL_THREAD]);
785		} else {
786			printf("Non-uniform topology");
787		}
788		printf("\n");
789	}
790
791	if (!bootverbose)
792		return;
793
794	TOPO_FOREACH(node, &topo_root) {
795		switch (node->type) {
796		case TOPO_TYPE_PKG:
797			printf("Package HW ID = %u\n", node->hwid);
798			break;
799		case TOPO_TYPE_CORE:
800			printf("\tCore HW ID = %u\n", node->hwid);
801			break;
802		case TOPO_TYPE_PU:
803			if (cpu_info[node->hwid].cpu_hyperthread)
804				hyperthread = "/HT";
805			else
806				hyperthread = "";
807
808			if (node->subtype == 0)
809				printf("\t\tCPU (AP%s): APIC ID: %u"
810				    " (disabled)\n", hyperthread, node->hwid);
811			else if (node->id == 0)
812				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
813				    node->hwid);
814			else
815				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
816				    node->id, hyperthread, node->hwid);
817			break;
818		default:
819			/* ignored */
820			break;
821		}
822	}
823}
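
/*
 * For reference, on a hypothetical dual-package, 8-core, 2-way SMT
 * machine the summary printed above would read something like
 *
 *	FreeBSD/SMP: 2 package(s) x 8 core(s) x 2 hardware threads
 *
 * with a second "FreeBSD/SMP Online:" line appearing only when some CPUs
 * have been disabled by tunables or by the MAXCPU limit.
 */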
824
825/*
826 * Add a scheduling group, a group of logical processors sharing
827 * a particular cache (and, thus having an affinity), to the scheduling
828 * topology.
829 * This function recursively works on lower level caches.
830 */
831static void
832x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
833{
834	struct topo_node *node;
835	int nchildren;
836	int ncores;
837	int i;
838
839	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
840	    root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
841	    ("x86topo_add_sched_group: bad type: %u", root->type));
842	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
843	cg_root->cg_count = root->cpu_count;
844	if (root->type == TOPO_TYPE_CACHE)
845		cg_root->cg_level = root->subtype;
846	else
847		cg_root->cg_level = CG_SHARE_NONE;
848	if (root->type == TOPO_TYPE_NODE)
849		cg_root->cg_flags = CG_FLAG_NODE;
850	else
851		cg_root->cg_flags = 0;
852
853	/*
854	 * Check how many core nodes we have under the given root node.
855	 * If we have multiple logical processors, but not multiple
856	 * cores, then those processors must be hardware threads.
857	 */
858	ncores = 0;
859	node = root;
860	while (node != NULL) {
861		if (node->type != TOPO_TYPE_CORE) {
862			node = topo_next_node(root, node);
863			continue;
864		}
865
866		ncores++;
867		node = topo_next_nonchild_node(root, node);
868	}
869
870	if (cg_root->cg_level != CG_SHARE_NONE &&
871	    root->cpu_count > 1 && ncores < 2)
872		cg_root->cg_flags |= CG_FLAG_SMT;
873
874	/*
875	 * Find out how many cache nodes we have under the given root node.
876	 * We ignore cache nodes that cover all the same processors as the
877	 * root node.  Also, we do not descend below found cache nodes.
878	 * That is, we count top-level "non-redundant" caches under the root
879	 * node.
880	 */
881	nchildren = 0;
882	node = root;
883	while (node != NULL) {
884		/*
885		 * When some APICs are disabled by tunables, nodes can end up
886		 * with an empty cpuset. Nodes with an empty cpuset will be
887		 * translated into cpu groups with empty cpusets. smp_topo_fill
888		 * will then set cg_first and cg_last to -1. This isn't
889		 * correctly handled in all functions. E.g. when
890		 * cpu_search_lowest and cpu_search_highest loop through all
891		 * cpus, they call CPU_ISSET on cpu -1 which ends up in a
892		 * general protection fault.
893		 *
894		 * We could fix the scheduler to handle empty cpu groups
895		 * correctly. Nevertheless, empty cpu groups are causing
896		 * overhead for no value, so it makes more sense simply not to
897		 * create them.
898		 */
899		if (CPU_EMPTY(&node->cpuset)) {
900			node = topo_next_node(root, node);
901			continue;
902		}
903		if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
904			if (node->type == TOPO_TYPE_CACHE &&
905			    cg_root->cg_level < node->subtype)
906				cg_root->cg_level = node->subtype;
907			if (node->type == TOPO_TYPE_NODE)
908				cg_root->cg_flags |= CG_FLAG_NODE;
909			node = topo_next_node(root, node);
910			continue;
911		}
912		if (node->type != TOPO_TYPE_GROUP &&
913		    node->type != TOPO_TYPE_NODE &&
914		    node->type != TOPO_TYPE_CACHE) {
915			node = topo_next_node(root, node);
916			continue;
917		}
918		nchildren++;
919		node = topo_next_nonchild_node(root, node);
920	}
921
922	/*
923	 * We are not interested in nodes including only one CPU each.
924	 */
925	if (nchildren == root->cpu_count)
926		return;
927
928	/*
929	 * We are not interested in nodes without children.
930	 */
931	cg_root->cg_children = nchildren;
932	if (nchildren == 0)
933		return;
934
935	cg_root->cg_child = smp_topo_alloc(nchildren);
936
937	/*
938	 * Now find again the same cache nodes as above and recursively
939	 * build scheduling topologies for them.
940	 */
941	node = root;
942	i = 0;
943	while (node != NULL) {
944		if ((node->type != TOPO_TYPE_GROUP &&
945		    node->type != TOPO_TYPE_NODE &&
946		    node->type != TOPO_TYPE_CACHE) ||
947		    CPU_CMP(&node->cpuset, &root->cpuset) == 0 ||
948		    CPU_EMPTY(&node->cpuset)) {
949			node = topo_next_node(root, node);
950			continue;
951		}
952		cg_root->cg_child[i].cg_parent = cg_root;
953		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
954		i++;
955		node = topo_next_nonchild_node(root, node);
956	}
957}
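
/*
 * Roughly, for a hypothetical two-package system with per-core L1/L2
 * caches, a package-wide L3 and 2-way SMT, the scheduling tree built
 * here would look like:
 *
 *	root (CPUs 0-7)
 *	  L3 group: CPUs 0-3, cg_level = 3
 *	    L2 group: CPUs 0-1, cg_level = 2, CG_FLAG_SMT
 *	    L2 group: CPUs 2-3, cg_level = 2, CG_FLAG_SMT
 *	  L3 group: CPUs 4-7
 *	    ...
 *
 * The per-core groups get CG_FLAG_SMT because they contain multiple
 * logical CPUs but only a single core.
 */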
958
959/*
960 * Build the MI scheduling topology from the discovered hardware topology.
961 */
962struct cpu_group *
963cpu_topo(void)
964{
965	struct cpu_group *cg_root;
966
967	if (mp_ncpus <= 1)
968		return (smp_topo_none());
969
970	cg_root = smp_topo_alloc(1);
971	x86topo_add_sched_group(&topo_root, cg_root);
972	return (cg_root);
973}
974
975static void
976cpu_alloc(void *dummy __unused)
977{
978	/*
979	 * Dynamically allocate the arrays that depend on the
980	 * maximum APIC ID.
981	 */
982	cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
983	    M_WAITOK | M_ZERO);
984	apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
985	    M_WAITOK | M_ZERO);
986}
987SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
988
989/*
990 * Add a logical CPU to the topology.
991 */
992void
993cpu_add(u_int apic_id, char boot_cpu)
994{
995
996	if (apic_id > max_apic_id)
997		panic("SMP: APIC ID %d too high", apic_id);
998
999	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
1000	    apic_id));
1001	cpu_info[apic_id].cpu_present = 1;
1002	if (boot_cpu) {
1003		KASSERT(boot_cpu_id == -1,
1004		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
1005		    boot_cpu_id));
1006		boot_cpu_id = apic_id;
1007		cpu_info[apic_id].cpu_bsp = 1;
1008	}
1009	if (bootverbose)
1010		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
1011		    "AP");
1012}
1013
1014void
1015cpu_mp_setmaxid(void)
1016{
1017
1018	/*
1019	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
1020	 * If there were no calls to cpu_add() assume this is a UP system.
1021	 */
1022	if (mp_ncpus == 0)
1023		mp_ncpus = 1;
1024}
1025
1026int
1027cpu_mp_probe(void)
1028{
1029
1030	/*
1031	 * Always record BSP in CPU map so that the mbuf init code works
1032	 * correctly.
1033	 */
1034	CPU_SETOF(0, &all_cpus);
1035	return (mp_ncpus > 1);
1036}
1037
1038/*
1039 * AP CPUs call this to initialize themselves.
1040 */
1041void
1042init_secondary_tail(void)
1043{
1044	u_int cpuid;
1045
1046	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
1047
1048	/*
1049	 * On real hardware, switch to x2apic mode if possible.  Do it
1050	 * after aps_ready has been signalled, to avoid changing the
1051	 * mode while the BSP might still want to send us an IPI
1052	 * (the second startup IPI is ignored on modern hardware, etc.).
1053	 */
1054	lapic_xapic_mode();
1055
1056	/* Initialize the PAT MSR. */
1057	pmap_init_pat();
1058
1059	/* set up CPU registers and state */
1060	cpu_setregs();
1061
1062	/* set up SSE/NX */
1063	initializecpu();
1064
1065	/* set up FPU state on the AP */
1066#ifdef __amd64__
1067	fpuinit();
1068#else
1069	npxinit(false);
1070#endif
1071
1072	if (cpu_ops.cpu_init)
1073		cpu_ops.cpu_init();
1074
1075	/* A quick check from sanity claus */
1076	cpuid = PCPU_GET(cpuid);
1077	if (PCPU_GET(apic_id) != lapic_id()) {
1078		printf("SMP: cpuid = %d\n", cpuid);
1079		printf("SMP: actual apic_id = %d\n", lapic_id());
1080		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
1081		panic("cpuid mismatch! boom!!");
1082	}
1083
1084	/* Initialize curthread. */
1085	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
1086	PCPU_SET(curthread, PCPU_GET(idlethread));
1087	schedinit_ap();
1088
1089	mtx_lock_spin(&ap_boot_mtx);
1090
1091	mca_init();
1092
1093	/* Init the local APIC for IRQs */
1094	lapic_setup(1);
1095
1096	/* Set memory range attributes for this CPU to match the BSP */
1097	mem_range_AP_init();
1098
1099	smp_cpus++;
1100
1101	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
1102	if (bootverbose)
1103		printf("SMP: AP CPU #%d Launched!\n", cpuid);
1104	else
1105		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
1106		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");
1107
1108	/* Determine if we are a logical CPU. */
1109	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
1110		CPU_SET(cpuid, &logical_cpus_mask);
1111
1112	if (bootverbose)
1113		lapic_dump("AP");
1114
1115	if (smp_cpus == mp_ncpus) {
1116		/* enable IPIs, TLB shootdown, freezes, etc. */
1117		atomic_store_rel_int(&smp_started, 1);
1118	}
1119
1120#ifdef __amd64__
1121	if (pmap_pcid_enabled)
1122		load_cr4(rcr4() | CR4_PCIDE);
1123	load_ds(_udatasel);
1124	load_es(_udatasel);
1125	load_fs(_ufssel);
1126#endif
1127
1128	mtx_unlock_spin(&ap_boot_mtx);
1129
1130	/* Wait until all the APs are up. */
1131	while (atomic_load_acq_int(&smp_started) == 0)
1132		ia32_pause();
1133
1134	kcsan_cpu_init(cpuid);
1135
1136	sched_ap_entry();
1137
1138	panic("scheduler returned us to %s", __func__);
1139	/* NOTREACHED */
1140}
1141
1142static void
1143smp_after_idle_runnable(void *arg __unused)
1144{
1145	int cpu;
1146
1147	if (mp_ncpus == 1)
1148		return;
1149
1150	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
1151
1152	/*
1153	 * Wait for all APs to handle an interrupt.  After that, we know that
1154	 * the APs have entered the scheduler at least once, so the boot stacks
1155	 * are safe to free.
1156	 */
1157	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
1158	    smp_no_rendezvous_barrier, NULL);
1159
1160	for (cpu = 1; cpu < mp_ncpus; cpu++) {
1161		kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE);
1162	}
1163}
1164SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
1165    smp_after_idle_runnable, NULL);
1166
1167/*
1168 * We tell the I/O APIC code about all the CPUs we want to receive
1169 * interrupts.  If we don't want certain CPUs to receive IRQs we
1170 * can simply not tell the I/O APIC code about them in this function.
1171 * We also do not tell it about the BSP since it tells itself about
1172 * the BSP internally to work with UP kernels and on UP machines.
1173 */
1174void
1175set_interrupt_apic_ids(void)
1176{
1177	u_int i, apic_id;
1178
1179	for (i = 0; i < MAXCPU; i++) {
1180		apic_id = cpu_apic_ids[i];
1181		if (apic_id == -1)
1182			continue;
1183		if (cpu_info[apic_id].cpu_bsp)
1184			continue;
1185		if (cpu_info[apic_id].cpu_disabled)
1186			continue;
1187		if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit)
1188			continue;
1189
1190		/* Don't let hyperthreads service interrupts. */
1191		if (cpu_info[apic_id].cpu_hyperthread &&
1192		    !hyperthreading_intr_allowed)
1193			continue;
1194
1195		intr_add_cpu(i);
1196	}
1197}
1198
1199#ifdef COUNT_XINVLTLB_HITS
1200u_int xhits_gbl[MAXCPU];
1201u_int xhits_pg[MAXCPU];
1202u_int xhits_rng[MAXCPU];
1203static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1204    "");
1205SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
1206    sizeof(xhits_gbl), "IU", "");
1207SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
1208    sizeof(xhits_pg), "IU", "");
1209SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
1210    sizeof(xhits_rng), "IU", "");
1211
1212u_int ipi_global;
1213u_int ipi_page;
1214u_int ipi_range;
1215u_int ipi_range_size;
1216SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
1217SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
1218SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
1219SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
1220    0, "");
1221#endif /* COUNT_XINVLTLB_HITS */
1222
1223/*
1224 * Init and startup IPI.
1225 */
1226void
1227ipi_startup(int apic_id, int vector)
1228{
1229
1230	/*
1231	 * This attempts to follow the algorithm described in the
1232	 * Intel Multiprocessor Specification v1.4 in section B.4.
1233	 * For each IPI, we allow the local APIC ~20us to deliver the
1234	 * IPI.  If that times out, we panic.
1235	 */
1236
1237	/*
1238	 * First we send an INIT IPI: it may be acted on, resetting and
1239	 * running the target CPU; or it may be latched (P5 bug), leaving
1240	 * the CPU waiting for a STARTUP IPI; or it may simply be
1241	 * ignored.
1242	 */
1243	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1244	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1245	lapic_ipi_wait(100);
1246
1247	/* Explicitly deassert the INIT IPI. */
1248	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1249	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
1250	    apic_id);
1251
1252	DELAY(10000);		/* wait ~10 ms */
1253
1254	/*
1255	 * Next we send a STARTUP IPI.  If the previous INIT IPI is still
1256	 * latched (P5 bug), this first STARTUP terminates immediately and
1257	 * the previously started INIT continues.  If the previous INIT
1258	 * IPI has already run, this STARTUP IPI will run.  If the
1259	 * previous INIT IPI was ignored, this STARTUP IPI will also
1260	 * run.
1261	 */
1262	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1263	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1264	    vector, apic_id);
1265	if (!lapic_ipi_wait(100))
1266		panic("Failed to deliver first STARTUP IPI to APIC %d",
1267		    apic_id);
1268	DELAY(200);		/* wait ~200 us */
1269
1270	/*
1271	 * Finally we send a second STARTUP IPI.  It runs only if the
1272	 * previous STARTUP IPI was cancelled by a latched INIT IPI;
1273	 * otherwise it is ignored, as only ONE STARTUP IPI is
1274	 * recognized after a hardware RESET or INIT IPI.
1275	 */
1276	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1277	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1278	    vector, apic_id);
1279	if (!lapic_ipi_wait(100))
1280		panic("Failed to deliver second STARTUP IPI to APIC %d",
1281		    apic_id);
1282
1283	DELAY(200);		/* wait ~200 us */
1284}
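
/*
 * Note that the STARTUP "vector" above is not an IDT vector: the AP
 * starts executing in real mode at physical address (vector << 12), so a
 * caller passes the page number of the boot trampoline, e.g. (a sketch,
 * with a hypothetical page-aligned boot_address below 1MB):
 *
 *	ipi_startup(cpu_apic_ids[cpu], boot_address >> 12);
 */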
1285
1286static bool
1287ipi_bitmap_set(int cpu, u_int ipi)
1288{
1289	u_int bitmap, old, new;
1290	u_int *cpu_bitmap;
1291
1292	bitmap = 1 << ipi;
1293	cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
1294	old = *cpu_bitmap;
1295	for (;;) {
1296		if ((old & bitmap) != 0)
1297			break;
1298		new = old | bitmap;
1299		if (atomic_fcmpset_int(cpu_bitmap, &old, new))
1300			break;
1301	}
1302	return (old != 0);
1303}
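
/*
 * The return value allows bitmapped IPIs to be coalesced: if some bit
 * was already pending for the target CPU, an IPI_BITMAP_VECTOR interrupt
 * is already on its way and sending another one is unnecessary.  This is
 * exactly how ipi_send_cpu() below uses it:
 *
 *	if (IPI_IS_BITMAPED(ipi)) {
 *		if (ipi_bitmap_set(cpu, ipi))
 *			return;
 *		ipi = IPI_BITMAP_VECTOR;
 *	}
 */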
1304
1305/*
1306 * Send an IPI to specified CPU handling the bitmap logic.
1307 */
1308static void
1309ipi_send_cpu(int cpu, u_int ipi)
1310{
1311
1312	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
1313	    ("IPI to non-existent CPU %d", cpu));
1314
1315	if (IPI_IS_BITMAPED(ipi)) {
1316		if (ipi_bitmap_set(cpu, ipi))
1317			return;
1318		ipi = IPI_BITMAP_VECTOR;
1319	}
1320	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1321}
1322
1323void
1324ipi_bitmap_handler(struct trapframe frame)
1325{
1326	struct trapframe *oldframe;
1327	struct thread *td;
1328	int cpu = PCPU_GET(cpuid);
1329	u_int ipi_bitmap;
1330
1331	kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);
1332
1333	td = curthread;
1334	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
1335	    pc_ipi_bitmap);
1336
1337	/*
1338	 * sched_preempt() must be called to clear the pending preempt
1339	 * IPI to enable delivery of further preempts.  However, the
1340	 * critical section will cause extra scheduler lock thrashing
1341	 * when used unconditionally.  Only critical_enter() if
1342	 * hardclock must also run, which requires the section entry.
1343	 */
1344	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1345		critical_enter();
1346
1347	td->td_intr_nesting_level++;
1348	oldframe = td->td_intr_frame;
1349	td->td_intr_frame = &frame;
1350#if defined(STACK) || defined(DDB)
1351	if (ipi_bitmap & (1 << IPI_TRACE))
1352		stack_capture_intr();
1353#endif
1354	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1355#ifdef COUNT_IPIS
1356		(*ipi_preempt_counts[cpu])++;
1357#endif
1358		sched_preempt(td);
1359	}
1360	if (ipi_bitmap & (1 << IPI_AST)) {
1361#ifdef COUNT_IPIS
1362		(*ipi_ast_counts[cpu])++;
1363#endif
1364		/* Nothing to do for AST */
1365	}
1366	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1367#ifdef COUNT_IPIS
1368		(*ipi_hardclock_counts[cpu])++;
1369#endif
1370		hardclockintr();
1371	}
1372	td->td_intr_frame = oldframe;
1373	td->td_intr_nesting_level--;
1374	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1375		critical_exit();
1376}
1377
1378/*
1379 * send an IPI to a set of cpus.
1380 */
1381void
1382ipi_selected(cpuset_t cpus, u_int ipi)
1383{
1384	int cpu;
1385
1386	/*
1387	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1388	 * of help to understand what the source is.
1389	 * Set the mask of receiving CPUs for this purpose.
1390	 */
1391	if (ipi == IPI_STOP_HARD)
1392		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1393
1394	CPU_FOREACH_ISSET(cpu, &cpus) {
1395		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1396		ipi_send_cpu(cpu, ipi);
1397	}
1398}
1399
1400/*
1401 * send an IPI to a specific CPU.
1402 */
1403void
1404ipi_cpu(int cpu, u_int ipi)
1405{
1406
1407	/*
1408	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1409	 * of help to understand what the source is.
1410	 * Set the mask of receiving CPUs for this purpose.
1411	 */
1412	if (ipi == IPI_STOP_HARD)
1413		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1414
1415	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1416	ipi_send_cpu(cpu, ipi);
1417}
1418
1419/*
1420 * send an IPI to all CPUs EXCEPT myself
1421 */
1422void
1423ipi_all_but_self(u_int ipi)
1424{
1425	cpuset_t other_cpus;
1426	int cpu, c;
1427
1428	/*
1429	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1430	 * of help to understand what the source is.
1431	 * Set the mask of receiving CPUs for this purpose.
1432	 */
1433	if (ipi == IPI_STOP_HARD) {
1434		other_cpus = all_cpus;
1435		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1436		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1437	}
1438
1439	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1440	if (IPI_IS_BITMAPED(ipi)) {
1441		cpu = PCPU_GET(cpuid);
1442		CPU_FOREACH(c) {
1443			if (c != cpu)
1444				ipi_bitmap_set(c, ipi);
1445		}
1446		ipi = IPI_BITMAP_VECTOR;
1447	}
1448	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1449}
1450
1451void
1452ipi_self_from_nmi(u_int vector)
1453{
1454
1455	lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);
1456
1457	/* Wait for IPI to finish. */
1458	if (!lapic_ipi_wait(50000)) {
1459		if (KERNEL_PANICKED())
1460			return;
1461		else
1462			panic("APIC: IPI is stuck");
1463	}
1464}
1465
1466int
1467ipi_nmi_handler(void)
1468{
1469	u_int cpuid;
1470
1471	/*
1472	 * Since there is no simple way to identify an NMI's source,
1473	 * assume that if this CPU's bit is set in the global pending
1474	 * bitmask, an IPI_STOP_HARD has been issued to this CPU and
1475	 * should be handled.
1476	 */
1477	cpuid = PCPU_GET(cpuid);
1478	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1479		return (1);
1480
1481	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1482	cpustop_handler();
1483	return (0);
1484}
1485
1486int nmi_kdb_lock;
1487
1488void
1489nmi_call_kdb_smp(u_int type, struct trapframe *frame)
1490{
1491	int cpu;
1492	bool call_post;
1493
1494	cpu = PCPU_GET(cpuid);
1495	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
1496		nmi_call_kdb(cpu, type, frame);
1497		call_post = false;
1498	} else {
1499		savectx(&stoppcbs[cpu]);
1500		CPU_SET_ATOMIC(cpu, &stopped_cpus);
1501		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
1502			ia32_pause();
1503		call_post = true;
1504	}
1505	atomic_store_rel_int(&nmi_kdb_lock, 0);
1506	if (call_post)
1507		cpustop_handler_post(cpu);
1508}
1509
1510/*
1511 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
1512 * if available) until we are resumed.
1513 */
1514void
1515cpustop_handler(void)
1516{
1517	struct monitorbuf *mb;
1518	u_int cpu;
1519	bool use_mwait;
1520
1521	cpu = PCPU_GET(cpuid);
1522
1523	savectx(&stoppcbs[cpu]);
1524
1525	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
1526	    !mwait_cpustop_broken);
1527	if (use_mwait) {
1528		mb = PCPU_PTR(monitorbuf);
1529		atomic_store_int(&mb->stop_state,
1530		    MONITOR_STOPSTATE_STOPPED);
1531	}
1532
1533	/* Indicate that we are stopped */
1534	CPU_SET_ATOMIC(cpu, &stopped_cpus);
1535
1536	/* Wait for restart */
1537	while (!CPU_ISSET(cpu, &started_cpus)) {
1538		if (use_mwait) {
1539			cpu_monitor(mb, 0, 0);
1540			if (atomic_load_int(&mb->stop_state) ==
1541			    MONITOR_STOPSTATE_STOPPED)
1542				cpu_mwait(0, MWAIT_C1);
1543			continue;
1544		}
1545
1546		ia32_pause();
1547
1548		/*
1549		 * Halt non-BSP CPUs on panic -- we're never going to need them
1550		 * again, and might as well save power / release resources
1551		 * (e.g., overprovisioned VM infrastructure).
1552		 */
1553		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
1554			halt();
1555	}
1556
1557	cpustop_handler_post(cpu);
1558}
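
/*
 * The sending half of this protocol lives in the MI smp code; a CPU that
 * wants to stop the others typically does something like (a sketch):
 *
 *	stop_cpus_hard(other_cpus);	(sends IPI_STOP_HARD, an NMI)
 *	... inspect or modify remote state ...
 *	restart_cpus(stopped_cpus);	(sets started_cpus, releasing us)
 *
 * When mwait is used, the waking side must also write the monitored
 * stop_state word so that the mwait above falls through.
 */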
1559
1560static void
1561cpustop_handler_post(u_int cpu)
1562{
1563
1564	CPU_CLR_ATOMIC(cpu, &started_cpus);
1565	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1566
1567	/*
1568	 * We don't broadcast TLB invalidations to other CPUs when they are
1569	 * stopped. Hence, we clear the TLB before resuming.
1570	 */
1571	invltlb_glob();
1572
1573#if defined(__amd64__) && (defined(DDB) || defined(GDB))
1574	amd64_db_resume_dbreg();
1575#endif
1576
1577	if (cpu == 0 && cpustop_restartfunc != NULL) {
1578		cpustop_restartfunc();
1579		cpustop_restartfunc = NULL;
1580	}
1581}
1582
1583/*
1584 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1585 * are resumed.
1586 */
1587void
1588cpususpend_handler(void)
1589{
1590	u_int cpu;
1591
1592	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
1593
1594	cpu = PCPU_GET(cpuid);
1595
1596#ifdef XENHVM
1597	/*
1598	 * Some Xen guest types (PVH) expose a very minimal set of ACPI tables,
1599	 * and for example have no support for SCI.  That leads to the suspend
1600	 * stacks not being allocated, and hence when attempting to perform a
1601	 * Xen triggered suspension FreeBSD will hit a #PF.  Avoid saving the
1602	 * CPU and FPU contexts if the stacks are not allocated, as the
1603	 * hypervisor will already take care of this.  Note that we could even
1604	 * do this for Xen triggered suspensions on guests that have full ACPI
1605	 * support, but doing so would introduce extra complexity.
1606	 */
1607	if (susppcbs == NULL) {
1608		KASSERT(vm_guest == VM_GUEST_XEN, ("Missing suspend stack"));
1609		CPU_SET_ATOMIC(cpu, &suspended_cpus);
1610		CPU_SET_ATOMIC(cpu, &resuming_cpus);
1611	} else
1612#endif
1613	if (savectx(&susppcbs[cpu]->sp_pcb)) {
1614#ifdef __amd64__
1615		fpususpend(susppcbs[cpu]->sp_fpususpend);
1616#else
1617		npxsuspend(susppcbs[cpu]->sp_fpususpend);
1618#endif
1619		/*
1620		 * suspended_cpus is cleared shortly after each AP is restarted
1621		 * by a Startup IPI, so that the BSP can proceed to restarting
1622		 * the next AP.
1623		 *
1624		 * resuming_cpus gets cleared when the AP completes
1625		 * initialization after having been released by the BSP.
1626		 * resuming_cpus is probably not the best name for the
1627		 * variable, because it is actually a set of processors that
1628		 * haven't resumed yet and haven't necessarily started resuming.
1629		 *
1630		 * Note that suspended_cpus is meaningful only for ACPI suspend
1631		 * as it's not really used for Xen suspend since the APs are
1632		 * automatically restored to the running state and the correct
1633		 * context.  For the same reason resumectx is never called in
1634		 * that case.
1635		 */
1636		CPU_SET_ATOMIC(cpu, &suspended_cpus);
1637		CPU_SET_ATOMIC(cpu, &resuming_cpus);
1638
1639		/*
1640		 * Invalidate the cache after setting the global status bits.
1641		 * The last AP to set its bit may end up being an Owner of the
1642		 * corresponding cache line in MOESI protocol.  The AP may be
1643		 * stopped before the cache line is written to the main memory.
1644		 */
1645		wbinvd();
1646	} else {
1647#ifdef __amd64__
1648		fpuresume(susppcbs[cpu]->sp_fpususpend);
1649#else
1650		npxresume(susppcbs[cpu]->sp_fpususpend);
1651#endif
1652		pmap_init_pat();
1653		initializecpu();
1654		PCPU_SET(switchtime, 0);
1655		PCPU_SET(switchticks, ticks);
1656
1657		/* Indicate that we have restarted and restored the context. */
1658		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1659	}
1660
1661	/* Wait for resume directive */
1662	while (!CPU_ISSET(cpu, &toresume_cpus))
1663		ia32_pause();
1664
1665	/* Re-apply microcode updates. */
1666	ucode_reload();
1667
1668#ifdef __i386__
1669	/* Finish removing the identity mapping of low memory for this AP. */
1670	invltlb_glob();
1671#endif
1672
1673	if (cpu_ops.cpu_resume)
1674		cpu_ops.cpu_resume();
1675#ifdef __amd64__
1676	if (vmm_resume_p)
1677		vmm_resume_p();
1678#endif
1679
1680	/* Resume MCA and local APIC */
1681	lapic_xapic_mode();
1682	mca_resume();
1683	lapic_setup(0);
1684
1685	/* Indicate that we are resumed */
1686	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
1687	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1688	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
1689}
1690
1691/*
1692 * Handle an IPI_SWI by waking delayed SWI thread.
1693 */
1694void
1695ipi_swi_handler(struct trapframe frame)
1696{
1697
1698	intr_event_handle(clk_intr_event, &frame);
1699}
1700
1701/*
1702 * This is called once the rest of the system is up and running and we're
1703 * ready to let the APs out of the pen.
1704 */
1705static void
1706release_aps(void *dummy __unused)
1707{
1708
1709	if (mp_ncpus == 1)
1710		return;
1711	atomic_store_rel_int(&aps_ready, 1);
1712	while (smp_started == 0)
1713		ia32_pause();
1714}
1715SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1716
1717#ifdef COUNT_IPIS
1718/*
1719 * Setup interrupt counters for IPI handlers.
1720 */
1721static void
1722mp_ipi_intrcnt(void *dummy)
1723{
1724	char buf[64];
1725	int i;
1726
1727	CPU_FOREACH(i) {
1728		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1729		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1730		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1731		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1732		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1733		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1734		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
1735		intrcnt_add(buf, &ipi_invlcache_counts[i]);
1736		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1737		intrcnt_add(buf, &ipi_preempt_counts[i]);
1738		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1739		intrcnt_add(buf, &ipi_ast_counts[i]);
1740		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1741		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1742		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1743		intrcnt_add(buf, &ipi_hardclock_counts[i]);
1744	}
1745}
1746SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1747#endif
1748