mp_x86.c revision 337118
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/11/sys/x86/x86/mp_x86.c 337118 2018-08-02 09:00:09Z avg $");
29
30#ifdef __i386__
31#include "opt_apic.h"
32#endif
33#include "opt_cpu.h"
34#include "opt_kstack_pages.h"
35#include "opt_pmap.h"
36#include "opt_sched.h"
37#include "opt_smp.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/bus.h>
42#include <sys/cons.h>	/* cngetc() */
43#include <sys/cpuset.h>
44#ifdef GPROF
45#include <sys/gmon.h>
46#endif
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/memrange.h>
52#include <sys/mutex.h>
53#include <sys/pcpu.h>
54#include <sys/proc.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sysctl.h>
58
59#include <vm/vm.h>
60#include <vm/vm_param.h>
61#include <vm/pmap.h>
62#include <vm/vm_kern.h>
63#include <vm/vm_extern.h>
64
65#include <x86/apicreg.h>
66#include <machine/clock.h>
67#include <machine/cputypes.h>
68#include <x86/mca.h>
69#include <machine/md_var.h>
70#include <machine/pcb.h>
71#include <machine/psl.h>
72#include <machine/smp.h>
73#include <machine/specialreg.h>
74#include <machine/cpu.h>
75
76/* lock region used by kernel profiling */
77int	mcount_lock;
78
79int	mp_naps;		/* # of Application Processors */
80int	boot_cpu_id = -1;	/* designated BSP */
81
82extern	struct pcpu __pcpu[];
83
84/* AP uses this during bootstrap.  Do not staticize.  */
85char *bootSTK;
86int bootAP;
87
88/* Free these after use */
89void *bootstacks[MAXCPU];
90void *dpcpu;
91
92struct pcb stoppcbs[MAXCPU];
93struct susppcb **susppcbs;
94
95#ifdef COUNT_IPIS
96/* Interrupt counts. */
97static u_long *ipi_preempt_counts[MAXCPU];
98static u_long *ipi_ast_counts[MAXCPU];
99u_long *ipi_invltlb_counts[MAXCPU];
100u_long *ipi_invlrng_counts[MAXCPU];
101u_long *ipi_invlpg_counts[MAXCPU];
102u_long *ipi_invlcache_counts[MAXCPU];
103u_long *ipi_rendezvous_counts[MAXCPU];
104static u_long *ipi_hardclock_counts[MAXCPU];
105#endif
106
107/* Default cpu_ops implementation. */
108struct cpu_ops cpu_ops;
109
110/*
111 * Local data and functions.
112 */
113
114static volatile cpuset_t ipi_stop_nmi_pending;
115
116volatile cpuset_t resuming_cpus;
117volatile cpuset_t toresume_cpus;
118
119/* used to hold the APs until we are ready to release them */
120struct mtx ap_boot_mtx;
121
122/* Set to 1 once we're ready to let the APs out of the pen. */
123volatile int aps_ready = 0;
124
125/*
126 * Store data from cpu_add() until later in the boot when we actually set up
127 * the APs.
128 */
129struct cpu_info cpu_info[MAX_APIC_ID + 1];
130int apic_cpuids[MAX_APIC_ID + 1];
131int cpu_apic_ids[MAXCPU];
132
133/* Holds pending bitmap based IPIs per CPU */
134volatile u_int cpu_ipi_pending[MAXCPU];
135
136static void	release_aps(void *dummy);
137static void	cpustop_handler_post(u_int cpu);
138
139static int	hyperthreading_allowed = 1;
140SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
141	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
142
143static struct topo_node topo_root;
144
145static int pkg_id_shift;
146static int core_id_shift;
147static int disabled_cpus;
148
149struct cache_info {
150	int	id_shift;
151	int	present;
152} static caches[MAX_CACHE_LEVELS];
153
154void
155mem_range_AP_init(void)
156{
157
158	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
159		mem_range_softc.mr_op->initAP(&mem_range_softc);
160}
161
162/*
163 * Round up to the next power of two, if necessary, and then
164 * take log2.
165 * Returns -1 if argument is zero.
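 * For example, mask_width(4) is 2, while mask_width(6) rounds up to 8
 * and yields 3.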
166 */
167static __inline int
168mask_width(u_int x)
169{
170
171	return (fls(x << (1 - powerof2(x))) - 1);
172}
173
174/*
175 * Add a cache level to the cache topology description.
176 */
177static int
178add_deterministic_cache(int type, int level, int share_count)
179{
180
181	if (type == 0)
182		return (0);
183	if (type > 3) {
184		printf("unexpected cache type %d\n", type);
185		return (1);
186	}
187	if (type == 2) /* ignore instruction cache */
188		return (1);
189	if (level == 0 || level > MAX_CACHE_LEVELS) {
190		printf("unexpected cache level %d\n", level);
191		return (1);
192	}
193
194	if (caches[level - 1].present) {
195		printf("WARNING: multiple entries for L%u data cache\n", level);
196		printf("%u => %u\n", caches[level - 1].id_shift,
197		    mask_width(share_count));
198	}
199	caches[level - 1].id_shift = mask_width(share_count);
200	caches[level - 1].present = 1;
201
202	if (caches[level - 1].id_shift > pkg_id_shift) {
203		printf("WARNING: L%u data cache covers more "
204		    "APIC IDs than a package\n", level);
205		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
206		caches[level - 1].id_shift = pkg_id_shift;
207	}
208	if (caches[level - 1].id_shift < core_id_shift) {
209		printf("WARNING: L%u data cache covers less "
210		    "APIC IDs than a core\n", level);
211		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
212		caches[level - 1].id_shift = core_id_shift;
213	}
214
215	return (1);
216}
217
218/*
219 * Determine topology of processing units and caches for AMD CPUs.
220 * See:
221 *  - AMD CPUID Specification (Publication # 25481)
222 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
223 *  - BKDG For AMD Family 10h Processors (Publication # 31116)
224 *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
225 *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
226 */
227static void
228topo_probe_amd(void)
229{
230	u_int p[4];
231	uint64_t v;
232	int level;
233	int nodes_per_socket;
234	int share_count;
235	int type;
236	int i;
237
238	/* No multi-core capability. */
239	if ((amd_feature2 & AMDID2_CMP) == 0)
240		return;
241
242	/* For families 10h and newer. */
243	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
244	    AMDID_COREID_SIZE_SHIFT;
245
246	/* For 0Fh family. */
247	if (pkg_id_shift == 0)
248		pkg_id_shift =
249		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
250
251	/*
252	 * Families prior to 16h define the following value as
253	 * cores per compute unit and we don't really care about the AMD
254	 * compute units at the moment.  Perhaps we should treat them as
255	 * cores and cores within the compute units as hardware threads,
256	 * but that's up for debate.
257	 * Later families define the value as threads per compute unit,
258	 * so we are following AMD's nomenclature here.
259	 */
260	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
261	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
262		cpuid_count(0x8000001e, 0, p);
263		share_count = ((p[1] >> 8) & 0xff) + 1;
264		core_id_shift = mask_width(share_count);
265	}
266
267	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
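		/*
		 * Leaf 0x8000001d uses the same layout as Intel leaf 4:
		 * EAX[4:0] is the cache type, EAX[7:5] the level and
		 * EAX[25:14] the number of sharing APIC IDs minus one.
		 */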
268		for (i = 0; ; i++) {
269			cpuid_count(0x8000001d, i, p);
270			type = p[0] & 0x1f;
271			level = (p[0] >> 5) & 0x7;
272			share_count = 1 + ((p[0] >> 14) & 0xfff);
273
274			if (!add_deterministic_cache(type, level, share_count))
275				break;
276		}
277	} else {
278		if (cpu_exthigh >= 0x80000005) {
279			cpuid_count(0x80000005, 0, p);
280			if (((p[2] >> 24) & 0xff) != 0) {
281				caches[0].id_shift = 0;
282				caches[0].present = 1;
283			}
284		}
285		if (cpu_exthigh >= 0x80000006) {
286			cpuid_count(0x80000006, 0, p);
287			if (((p[2] >> 16) & 0xffff) != 0) {
288				caches[1].id_shift = 0;
289				caches[1].present = 1;
290			}
291			if (((p[3] >> 18) & 0x3fff) != 0) {
292				nodes_per_socket = 1;
293				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
294					/*
295					 * Handle multi-node processors that
296					 * have multiple chips, each with its
297					 * own L3 cache, on the same die.
298					 */
299					v = rdmsr(0xc001100c);
300					nodes_per_socket = 1 + ((v >> 3) & 0x7);
301				}
302				caches[2].id_shift =
303				    pkg_id_shift - mask_width(nodes_per_socket);
304				caches[2].present = 1;
305			}
306		}
307	}
308}
309
310/*
311 * Determine topology of processing units for Intel CPUs
312 * using CPUID Leaf 1 and Leaf 4, if supported.
313 * See:
314 *  - Intel 64 Architecture Processor Topology Enumeration
315 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
316 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
317 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
318 */
319static void
320topo_probe_intel_0x4(void)
321{
322	u_int p[4];
323	int max_cores;
324	int max_logical;
325
326	/* Both zero and one here mean one logical processor per package. */
327	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
328	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
329	if (max_logical <= 1)
330		return;
331
332	if (cpu_high >= 0x4) {
333		cpuid_count(0x04, 0, p);
334		max_cores = ((p[0] >> 26) & 0x3f) + 1;
335	} else
336		max_cores = 1;
337
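	/*
	 * For example, 16 logical CPUs across 8 cores give
	 * core_id_shift = 1 and pkg_id_shift = 4.
	 */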
338	core_id_shift = mask_width(max_logical/max_cores);
339	KASSERT(core_id_shift >= 0,
340	    ("intel topo: max_cores > max_logical\n"));
341	pkg_id_shift = core_id_shift + mask_width(max_cores);
342}
343
344/*
345 * Determine topology of processing units for Intel CPUs
346 * using CPUID Leaf 11, if supported.
347 * See:
348 *  - Intel 64 Architecture Processor Topology Enumeration
349 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
350 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
351 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
352 */
353static void
354topo_probe_intel_0xb(void)
355{
356	u_int p[4];
357	int bits;
358	int type;
359	int i;
360
361	/* Fall back if CPU leaf 11 doesn't really exist. */
362	cpuid_count(0x0b, 0, p);
363	if (p[1] == 0) {
364		topo_probe_intel_0x4();
365		return;
366	}
367
368	/* We only support three levels for now. */
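	/*
	 * Each sub-leaf reports the ID shift in EAX[4:0] and the level
	 * type in ECX[15:8].
	 */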
369	for (i = 0; ; i++) {
370		cpuid_count(0x0b, i, p);
371
372		bits = p[0] & 0x1f;
373		type = (p[2] >> 8) & 0xff;
374
375		if (type == 0)
376			break;
377
378		/* TODO: check for duplicate (re-)assignment */
379		if (type == CPUID_TYPE_SMT)
380			core_id_shift = bits;
381		else if (type == CPUID_TYPE_CORE)
382			pkg_id_shift = bits;
383		else
384			printf("unknown CPU level type %d\n", type);
385	}
386
387	if (pkg_id_shift < core_id_shift) {
388		printf("WARNING: core covers more APIC IDs than a package\n");
389		core_id_shift = pkg_id_shift;
390	}
391}
392
393/*
394 * Determine topology of caches for Intel CPUs.
395 * See:
396 *  - Intel 64 Architecture Processor Topology Enumeration
397 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual
398 *    Volume 2A: Instruction Set Reference, A-M,
399 *    CPUID instruction
400 */
401static void
402topo_probe_intel_caches(void)
403{
404	u_int p[4];
405	int level;
406	int share_count;
407	int type;
408	int i;
409
410	if (cpu_high < 0x4) {
411		/*
412		 * Available cache level and sizes can be determined
413		 * via CPUID leaf 2, but that requires a huge table of hardcoded
414		 * values, so for now just assume L1 and L2 caches potentially
415		 * shared only by HTT processing units, if HTT is present.
416		 */
417		caches[0].id_shift = pkg_id_shift;
418		caches[0].present = 1;
419		caches[1].id_shift = pkg_id_shift;
420		caches[1].present = 1;
421		return;
422	}
423
424	for (i = 0; ; i++) {
425		cpuid_count(0x4, i, p);
426		type = p[0] & 0x1f;
427		level = (p[0] >> 5) & 0x7;
428		share_count = 1 + ((p[0] >> 14) & 0xfff);
429
430		if (!add_deterministic_cache(type, level, share_count))
431			break;
432	}
433}
434
435/*
436 * Determine topology of processing units and caches for Intel CPUs.
437 * See:
438 *  - Intel 64 Architecture Processor Topology Enumeration
439 */
440static void
441topo_probe_intel(void)
442{
443
444	/*
445	 * Note that 0x1 <= cpu_high < 4 case should be
446	 * compatible with topo_probe_intel_0x4() logic when
447	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
448	 * or it should trigger the fallback otherwise.
449	 */
450	if (cpu_high >= 0xb)
451		topo_probe_intel_0xb();
452	else if (cpu_high >= 0x1)
453		topo_probe_intel_0x4();
454
455	topo_probe_intel_caches();
456}
457
458/*
459 * Topology information is queried only on the BSP, on which this
460 * code runs and for which it can query CPUID information.
461 * The topology is then extrapolated to all packages under the
462 * assumption that the APIC ID to hardware component ID mapping is
463 * homogeneous.
464 * That does not necessarily imply that the topology is uniform.
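 * For example, with pkg_id_shift = 4 and core_id_shift = 1, APIC ID 0x0b
 * maps to package 0, core 5, second hardware thread of that core.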
465 */
466void
467topo_probe(void)
468{
469	static int cpu_topo_probed = 0;
470	struct x86_topo_layer {
471		int type;
472		int subtype;
473		int id_shift;
474	} topo_layers[MAX_CACHE_LEVELS + 3];
475	struct topo_node *parent;
476	struct topo_node *node;
477	int layer;
478	int nlayers;
479	int node_id;
480	int i;
481
482	if (cpu_topo_probed)
483		return;
484
485	CPU_ZERO(&logical_cpus_mask);
486
487	if (mp_ncpus <= 1)
488		; /* nothing */
489	else if (cpu_vendor_id == CPU_VENDOR_AMD)
490		topo_probe_amd();
491	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
492		topo_probe_intel();
493
494	KASSERT(pkg_id_shift >= core_id_shift,
495	    ("bug in APIC topology discovery"));
496
497	nlayers = 0;
498	bzero(topo_layers, sizeof(topo_layers));
499
500	topo_layers[nlayers].type = TOPO_TYPE_PKG;
501	topo_layers[nlayers].id_shift = pkg_id_shift;
502	if (bootverbose)
503		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
504	nlayers++;
505
506	/*
507	 * Consider all caches to be within a package/chip
508	 * and "in front" of all sub-components like
509	 * cores and hardware threads.
510	 */
511	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
512		if (caches[i].present) {
513			KASSERT(caches[i].id_shift <= pkg_id_shift,
514				("bug in APIC topology discovery"));
515			KASSERT(caches[i].id_shift >= core_id_shift,
516				("bug in APIC topology discovery"));
517
518			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
519			topo_layers[nlayers].subtype = i + 1;
520			topo_layers[nlayers].id_shift = caches[i].id_shift;
521			if (bootverbose)
522				printf("L%u cache ID shift: %u\n",
523				    topo_layers[nlayers].subtype,
524				    topo_layers[nlayers].id_shift);
525			nlayers++;
526		}
527	}
528
529	if (pkg_id_shift > core_id_shift) {
530		topo_layers[nlayers].type = TOPO_TYPE_CORE;
531		topo_layers[nlayers].id_shift = core_id_shift;
532		if (bootverbose)
533			printf("Core ID shift: %u\n",
534			    topo_layers[nlayers].id_shift);
535		nlayers++;
536	}
537
538	topo_layers[nlayers].type = TOPO_TYPE_PU;
539	topo_layers[nlayers].id_shift = 0;
540	nlayers++;
541
542	topo_init_root(&topo_root);
543	for (i = 0; i <= MAX_APIC_ID; ++i) {
544		if (!cpu_info[i].cpu_present)
545			continue;
546
547		parent = &topo_root;
548		for (layer = 0; layer < nlayers; ++layer) {
549			node_id = i >> topo_layers[layer].id_shift;
550			parent = topo_add_node_by_hwid(parent, node_id,
551			    topo_layers[layer].type,
552			    topo_layers[layer].subtype);
553		}
554	}
555
556	parent = &topo_root;
557	for (layer = 0; layer < nlayers; ++layer) {
558		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
559		node = topo_find_node_by_hwid(parent, node_id,
560		    topo_layers[layer].type,
561		    topo_layers[layer].subtype);
562		topo_promote_child(node);
563		parent = node;
564	}
565
566	cpu_topo_probed = 1;
567}
568
569/*
570 * Assign logical CPU IDs to local APICs.
571 */
572void
573assign_cpu_ids(void)
574{
575	struct topo_node *node;
576	u_int smt_mask;
577
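	/*
	 * PUs whose low (SMT) APIC ID bits differ from the BSP's are
	 * flagged as hyperthreads in the loop below.
	 */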
578	smt_mask = (1u << core_id_shift) - 1;
579
580	/*
581	 * Assign CPU IDs to local APIC IDs and disable any CPUs
582	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
583	 */
584	mp_ncpus = 0;
585	TOPO_FOREACH(node, &topo_root) {
586		if (node->type != TOPO_TYPE_PU)
587			continue;
588
589		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
590			cpu_info[node->hwid].cpu_hyperthread = 1;
591
592		if (resource_disabled("lapic", node->hwid)) {
593			if (node->hwid != boot_cpu_id)
594				cpu_info[node->hwid].cpu_disabled = 1;
595			else
596				printf("Cannot disable BSP, APIC ID = %d\n",
597				    node->hwid);
598		}
599
600		if (!hyperthreading_allowed &&
601		    cpu_info[node->hwid].cpu_hyperthread)
602			cpu_info[node->hwid].cpu_disabled = 1;
603
604		if (mp_ncpus >= MAXCPU)
605			cpu_info[node->hwid].cpu_disabled = 1;
606
607		if (cpu_info[node->hwid].cpu_disabled) {
608			disabled_cpus++;
609			continue;
610		}
611
612		cpu_apic_ids[mp_ncpus] = node->hwid;
613		apic_cpuids[node->hwid] = mp_ncpus;
614		topo_set_pu_id(node, mp_ncpus);
615		mp_ncpus++;
616	}
617
618	KASSERT(mp_maxid >= mp_ncpus - 1,
619	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
620	    mp_ncpus));
621}
622
623/*
624 * Print various information about the SMP system hardware and setup.
625 */
626void
627cpu_mp_announce(void)
628{
629	struct topo_node *node;
630	const char *hyperthread;
631	int pkg_count;
632	int cores_per_pkg;
633	int thrs_per_core;
634
635	printf("FreeBSD/SMP: ");
636	if (topo_analyze(&topo_root, 1, &pkg_count,
637	    &cores_per_pkg, &thrs_per_core)) {
638		printf("%d package(s)", pkg_count);
639		if (cores_per_pkg > 0)
640			printf(" x %d core(s)", cores_per_pkg);
641		if (thrs_per_core > 1)
642		    printf(" x %d hardware threads", thrs_per_core);
643	} else {
644		printf("Non-uniform topology");
645	}
646	printf("\n");
647
648	if (disabled_cpus) {
649		printf("FreeBSD/SMP Online: ");
650		if (topo_analyze(&topo_root, 0, &pkg_count,
651		    &cores_per_pkg, &thrs_per_core)) {
652			printf("%d package(s)", pkg_count);
653			if (cores_per_pkg > 0)
654				printf(" x %d core(s)", cores_per_pkg);
655			if (thrs_per_core > 1)
656			    printf(" x %d hardware threads", thrs_per_core);
657		} else {
658			printf("Non-uniform topology");
659		}
660		printf("\n");
661	}
662
663	if (!bootverbose)
664		return;
665
666	TOPO_FOREACH(node, &topo_root) {
667		switch (node->type) {
668		case TOPO_TYPE_PKG:
669			printf("Package HW ID = %u (%#x)\n",
670			    node->hwid, node->hwid);
671			break;
672		case TOPO_TYPE_CORE:
673			printf("\tCore HW ID = %u (%#x)\n",
674			    node->hwid, node->hwid);
675			break;
676		case TOPO_TYPE_PU:
677			if (cpu_info[node->hwid].cpu_hyperthread)
678				hyperthread = "/HT";
679			else
680				hyperthread = "";
681
682			if (node->subtype == 0)
683				printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
684				    "(disabled)\n", hyperthread, node->hwid,
685				    node->hwid);
686			else if (node->id == 0)
687				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
688				    node->hwid, node->hwid);
689			else
690				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
691				    node->id, hyperthread, node->hwid,
692				    node->hwid);
693			break;
694		default:
695			/* ignored */
696			break;
697		}
698	}
699}
700
701/*
702 * Add a scheduling group, a group of logical processors sharing
703 * a particular cache (and, thus having an affinity), to the scheduling
704 * topology.
705 * This function recursively works on lower level caches.
706 */
707static void
708x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
709{
710	struct topo_node *node;
711	int nchildren;
712	int ncores;
713	int i;
714
715	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
716	    ("x86topo_add_sched_group: bad type: %u", root->type));
717	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
718	cg_root->cg_count = root->cpu_count;
719	if (root->type == TOPO_TYPE_SYSTEM)
720		cg_root->cg_level = CG_SHARE_NONE;
721	else
722		cg_root->cg_level = root->subtype;
723
724	/*
725	 * Check how many core nodes we have under the given root node.
726	 * If we have multiple logical processors, but not multiple
727	 * cores, then those processors must be hardware threads.
728	 */
729	ncores = 0;
730	node = root;
731	while (node != NULL) {
732		if (node->type != TOPO_TYPE_CORE) {
733			node = topo_next_node(root, node);
734			continue;
735		}
736
737		ncores++;
738		node = topo_next_nonchild_node(root, node);
739	}
740
741	if (cg_root->cg_level != CG_SHARE_NONE &&
742	    root->cpu_count > 1 && ncores < 2)
743		cg_root->cg_flags = CG_FLAG_SMT;
744
745	/*
746	 * Find out how many cache nodes we have under the given root node.
747	 * We ignore cache nodes that cover all the same processors as the
748	 * root node.  Also, we do not descend below found cache nodes.
749	 * That is, we count top-level "non-redundant" caches under the root
750	 * node.
751	 */
752	nchildren = 0;
753	node = root;
754	while (node != NULL) {
755		if (node->type != TOPO_TYPE_CACHE ||
756		    (root->type != TOPO_TYPE_SYSTEM &&
757		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
758			node = topo_next_node(root, node);
759			continue;
760		}
761		nchildren++;
762		node = topo_next_nonchild_node(root, node);
763	}
764
765	cg_root->cg_child = smp_topo_alloc(nchildren);
766	cg_root->cg_children = nchildren;
767
768	/*
769	 * Now find again the same cache nodes as above and recursively
770	 * build scheduling topologies for them.
771	 */
772	node = root;
773	i = 0;
774	while (node != NULL) {
775		if (node->type != TOPO_TYPE_CACHE ||
776		    (root->type != TOPO_TYPE_SYSTEM &&
777		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
778			node = topo_next_node(root, node);
779			continue;
780		}
781		cg_root->cg_child[i].cg_parent = cg_root;
782		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
783		i++;
784		node = topo_next_nonchild_node(root, node);
785	}
786}
787
788/*
789 * Build the MI scheduling topology from the discovered hardware topology.
790 */
791struct cpu_group *
792cpu_topo(void)
793{
794	struct cpu_group *cg_root;
795
796	if (mp_ncpus <= 1)
797		return (smp_topo_none());
798
799	cg_root = smp_topo_alloc(1);
800	x86topo_add_sched_group(&topo_root, cg_root);
801	return (cg_root);
802}
803
804
805/*
806 * Add a logical CPU to the topology.
807 */
808void
809cpu_add(u_int apic_id, char boot_cpu)
810{
811
812	if (apic_id > MAX_APIC_ID) {
813		panic("SMP: APIC ID %d too high", apic_id);
814		return;
815	}
816	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
817	    apic_id));
818	cpu_info[apic_id].cpu_present = 1;
819	if (boot_cpu) {
820		KASSERT(boot_cpu_id == -1,
821		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
822		    boot_cpu_id));
823		boot_cpu_id = apic_id;
824		cpu_info[apic_id].cpu_bsp = 1;
825	}
826	if (mp_ncpus < MAXCPU) {
827		mp_ncpus++;
828		mp_maxid = mp_ncpus - 1;
829	}
830	if (bootverbose)
831		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
832		    "AP");
833}
834
835void
836cpu_mp_setmaxid(void)
837{
838
839	/*
840	 * mp_ncpus and mp_maxid should already be set by calls to cpu_add().
841	 * If there were no calls to cpu_add(), assume this is a UP system.
842	 */
843	if (mp_ncpus == 0)
844		mp_ncpus = 1;
845}
846
847int
848cpu_mp_probe(void)
849{
850
851	/*
852	 * Always record BSP in CPU map so that the mbuf init code works
853	 * correctly.
854	 */
855	CPU_SETOF(0, &all_cpus);
856	return (mp_ncpus > 1);
857}
858
859/*
860 * AP CPUs call this to initialize themselves.
861 */
862void
863init_secondary_tail(void)
864{
865	u_int cpuid;
866
867	/*
868	 * On real hardware, switch to x2apic mode if possible.  Do it
869	 * after aps_ready has been signalled, to avoid manipulating the
870	 * mode while the BSP might still want to send us an IPI (a
871	 * second startup IPI is ignored on modern hardware, etc.).
872	 */
873	lapic_xapic_mode();
874
875	/* Initialize the PAT MSR. */
876	pmap_init_pat();
877
878	/* set up CPU registers and state */
879	cpu_setregs();
880
881	/* set up SSE/NX */
882	initializecpu();
883
884	/* set up FPU state on the AP */
885#ifdef __amd64__
886	fpuinit();
887#else
888	npxinit(false);
889#endif
890
891	if (cpu_ops.cpu_init)
892		cpu_ops.cpu_init();
893
894	/* A quick check from sanity claus */
895	cpuid = PCPU_GET(cpuid);
896	if (PCPU_GET(apic_id) != lapic_id()) {
897		printf("SMP: cpuid = %d\n", cpuid);
898		printf("SMP: actual apic_id = %d\n", lapic_id());
899		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
900		panic("cpuid mismatch! boom!!");
901	}
902
903	/* Initialize curthread. */
904	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
905	PCPU_SET(curthread, PCPU_GET(idlethread));
906
907	mtx_lock_spin(&ap_boot_mtx);
908
909	mca_init();
910
911	/* Init local apic for irq's */
912	lapic_setup(1);
913
914	/* Set memory range attributes for this CPU to match the BSP */
915	mem_range_AP_init();
916
917	smp_cpus++;
918
919	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
920	printf("SMP: AP CPU #%d Launched!\n", cpuid);
921
922	/* Determine if we are a logical CPU. */
923	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
924		CPU_SET(cpuid, &logical_cpus_mask);
925
926	if (bootverbose)
927		lapic_dump("AP");
928
929	if (smp_cpus == mp_ncpus) {
930		/* enable IPI's, tlb shootdown, freezes etc */
931		atomic_store_rel_int(&smp_started, 1);
932	}
933
934#ifdef __amd64__
935	/*
936	 * Enable global pages TLB extension
937	 * This also implicitly flushes the TLB
938	 */
939	load_cr4(rcr4() | CR4_PGE);
940	if (pmap_pcid_enabled)
941		load_cr4(rcr4() | CR4_PCIDE);
942	load_ds(_udatasel);
943	load_es(_udatasel);
944	load_fs(_ufssel);
945#endif
946
947	mtx_unlock_spin(&ap_boot_mtx);
948
949	/* Wait until all the APs are up. */
950	while (atomic_load_acq_int(&smp_started) == 0)
951		ia32_pause();
952
953#ifndef EARLY_AP_STARTUP
954	/* Start per-CPU event timers. */
955	cpu_initclocks_ap();
956#endif
957
958	sched_throw(NULL);
959
960	panic("scheduler returned us to %s", __func__);
961	/* NOTREACHED */
962}
963
964/*******************************************************************
965 * local functions and data
966 */
967
968/*
969 * We tell the I/O APIC code about all the CPUs we want to receive
970 * interrupts.  If we don't want certain CPUs to receive IRQs we
971 * can simply not tell the I/O APIC code about them in this function.
972 * We also do not tell it about the BSP since it tells itself about
973 * the BSP internally to work with UP kernels and on UP machines.
974 */
975void
976set_interrupt_apic_ids(void)
977{
978	u_int i, apic_id;
979
980	for (i = 0; i < MAXCPU; i++) {
981		apic_id = cpu_apic_ids[i];
982		if (apic_id == -1)
983			continue;
984		if (cpu_info[apic_id].cpu_bsp)
985			continue;
986		if (cpu_info[apic_id].cpu_disabled)
987			continue;
988
989		/* Don't let hyperthreads service interrupts. */
990		if (cpu_info[apic_id].cpu_hyperthread)
991			continue;
992
993		intr_add_cpu(i);
994	}
995}
996
997
998#ifdef COUNT_XINVLTLB_HITS
999u_int xhits_gbl[MAXCPU];
1000u_int xhits_pg[MAXCPU];
1001u_int xhits_rng[MAXCPU];
1002static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
1003SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
1004    sizeof(xhits_gbl), "IU", "");
1005SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
1006    sizeof(xhits_pg), "IU", "");
1007SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
1008    sizeof(xhits_rng), "IU", "");
1009
1010u_int ipi_global;
1011u_int ipi_page;
1012u_int ipi_range;
1013u_int ipi_range_size;
1014SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
1015SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
1016SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
1017SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
1018    0, "");
1019#endif /* COUNT_XINVLTLB_HITS */
1020
1021/*
1022 * Init and startup IPI.
1023 */
1024void
1025ipi_startup(int apic_id, int vector)
1026{
1027
1028	/*
1029	 * This attempts to follow the algorithm described in the
1030	 * Intel Multiprocessor Specification v1.4 in section B.4.
1031	 * For each IPI, we allow the local APIC ~20us to deliver the
1032	 * IPI.  If that times out, we panic.
1033	 */
1034
1035	/*
1036	 * First we send an INIT IPI: it might run, resetting and
1037	 * restarting the target CPU; it might be latched (P5 bug),
1038	 * leaving the CPU waiting for a STARTUP IPI; or it might be
1039	 * ignored entirely.
1040	 */
1041	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1042	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1043	lapic_ipi_wait(100);
1044
1045	/* Explicitly deassert the INIT IPI. */
1046	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1047	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
1048	    apic_id);
1049
1050	DELAY(10000);		/* wait ~10 ms */
1051
1052	/*
1053	 * Next we send a STARTUP IPI.  The previous INIT IPI might still be
1054	 * latched (P5 bug); this first STARTUP would then terminate
1055	 * immediately, and the previously started INIT IPI would continue.
1056	 * Otherwise the previous INIT IPI has already run, or was ignored,
1057	 * and this STARTUP IPI will run.
1059	 */
1060	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1061	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1062	    vector, apic_id);
1063	if (!lapic_ipi_wait(100))
1064		panic("Failed to deliver first STARTUP IPI to APIC %d",
1065		    apic_id);
1066	DELAY(200);		/* wait ~200 us */
1067
1068	/*
1069	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1070	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
1071	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1072	 * recognized after hardware RESET or INIT IPI.
1073	 */
1074	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1075	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1076	    vector, apic_id);
1077	if (!lapic_ipi_wait(100))
1078		panic("Failed to deliver second STARTUP IPI to APIC %d",
1079		    apic_id);
1080
1081	DELAY(200);		/* wait ~200 us */
1082}
1083
1084/*
1085 * Send an IPI to specified CPU handling the bitmap logic.
1086 */
1087void
1088ipi_send_cpu(int cpu, u_int ipi)
1089{
1090	u_int bitmap, old_pending, new_pending;
1091
1092	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
1093
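	/*
	 * Bitmapped IPIs are coalesced: set our bit in the per-CPU pending
	 * word and only send the shared IPI_BITMAP_VECTOR if no bits were
	 * pending before, since the handler drains the whole word.
	 */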
1094	if (IPI_IS_BITMAPED(ipi)) {
1095		bitmap = 1 << ipi;
1096		ipi = IPI_BITMAP_VECTOR;
1097		do {
1098			old_pending = cpu_ipi_pending[cpu];
1099			new_pending = old_pending | bitmap;
1100		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
1101		    old_pending, new_pending));
1102		if (old_pending)
1103			return;
1104	}
1105	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1106}
1107
1108void
1109ipi_bitmap_handler(struct trapframe frame)
1110{
1111	struct trapframe *oldframe;
1112	struct thread *td;
1113	int cpu = PCPU_GET(cpuid);
1114	u_int ipi_bitmap;
1115
1116	critical_enter();
1117	td = curthread;
1118	td->td_intr_nesting_level++;
1119	oldframe = td->td_intr_frame;
1120	td->td_intr_frame = &frame;
1121	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1122	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1123#ifdef COUNT_IPIS
1124		(*ipi_preempt_counts[cpu])++;
1125#endif
1126		sched_preempt(td);
1127	}
1128	if (ipi_bitmap & (1 << IPI_AST)) {
1129#ifdef COUNT_IPIS
1130		(*ipi_ast_counts[cpu])++;
1131#endif
1132		/* Nothing to do for AST */
1133	}
1134	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1135#ifdef COUNT_IPIS
1136		(*ipi_hardclock_counts[cpu])++;
1137#endif
1138		hardclockintr();
1139	}
1140	td->td_intr_frame = oldframe;
1141	td->td_intr_nesting_level--;
1142	critical_exit();
1143}
1144
1145/*
1146 * send an IPI to a set of cpus.
1147 */
1148void
1149ipi_selected(cpuset_t cpus, u_int ipi)
1150{
1151	int cpu;
1152
1153	/*
1154	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1155	 * of help in order to understand what the source is.
1156	 * Set the mask of receiving CPUs for this purpose.
1157	 */
1158	if (ipi == IPI_STOP_HARD)
1159		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1160
1161	while ((cpu = CPU_FFS(&cpus)) != 0) {
1162		cpu--;
1163		CPU_CLR(cpu, &cpus);
1164		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1165		ipi_send_cpu(cpu, ipi);
1166	}
1167}
1168
1169/*
1170 * send an IPI to a specific CPU.
1171 */
1172void
1173ipi_cpu(int cpu, u_int ipi)
1174{
1175
1176	/*
1177	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1178	 * of help in order to understand what the source is.
1179	 * Set the mask of receiving CPUs for this purpose.
1180	 */
1181	if (ipi == IPI_STOP_HARD)
1182		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1183
1184	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1185	ipi_send_cpu(cpu, ipi);
1186}
1187
1188/*
1189 * send an IPI to all CPUs EXCEPT myself
1190 */
1191void
1192ipi_all_but_self(u_int ipi)
1193{
1194	cpuset_t other_cpus;
1195
1196	other_cpus = all_cpus;
1197	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1198	if (IPI_IS_BITMAPED(ipi)) {
1199		ipi_selected(other_cpus, ipi);
1200		return;
1201	}
1202
1203	/*
1204	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1205	 * of help in order to understand what the source is.
1206	 * Set the mask of receiving CPUs for this purpose.
1207	 */
1208	if (ipi == IPI_STOP_HARD)
1209		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1210
1211	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1212	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1213}
1214
1215int
1216ipi_nmi_handler(void)
1217{
1218	u_int cpuid;
1219
1220	/*
1221	 * As long as there is not a simple way to know about an NMI's
1222	 * source, if the bitmask for the current CPU is present in
1223	 * the global pending bitword an IPI_STOP_HARD has been issued
1224	 * and should be handled.
1225	 */
1226	cpuid = PCPU_GET(cpuid);
1227	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1228		return (1);
1229
1230	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1231	cpustop_handler();
1232	return (0);
1233}
1234
1235int nmi_kdb_lock;
1236
1237void
1238nmi_call_kdb_smp(u_int type, struct trapframe *frame)
1239{
1240	int cpu;
1241	bool call_post;
1242
1243	cpu = PCPU_GET(cpuid);
1244	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
1245		nmi_call_kdb(cpu, type, frame);
1246		call_post = false;
1247	} else {
1248		savectx(&stoppcbs[cpu]);
1249		CPU_SET_ATOMIC(cpu, &stopped_cpus);
1250		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
1251			ia32_pause();
1252		call_post = true;
1253	}
1254	atomic_store_rel_int(&nmi_kdb_lock, 0);
1255	if (call_post)
1256		cpustop_handler_post(cpu);
1257}
1258
1259/*
1260 * Handle an IPI_STOP by saving our current context and spinning until we
1261 * are resumed.
1262 */
1263void
1264cpustop_handler(void)
1265{
1266	u_int cpu;
1267
1268	cpu = PCPU_GET(cpuid);
1269
1270	savectx(&stoppcbs[cpu]);
1271
1272	/* Indicate that we are stopped */
1273	CPU_SET_ATOMIC(cpu, &stopped_cpus);
1274
1275	/* Wait for restart */
1276	while (!CPU_ISSET(cpu, &started_cpus))
1277	    ia32_pause();
1278
1279	cpustop_handler_post(cpu);
1280}
1281
1282static void
1283cpustop_handler_post(u_int cpu)
1284{
1285
1286	CPU_CLR_ATOMIC(cpu, &started_cpus);
1287	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1288
1289#if defined(__amd64__) && defined(DDB)
1290	amd64_db_resume_dbreg();
1291#endif
1292
1293	if (cpu == 0 && cpustop_restartfunc != NULL) {
1294		cpustop_restartfunc();
1295		cpustop_restartfunc = NULL;
1296	}
1297}
1298
1299/*
1300 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1301 * are resumed.
1302 */
1303void
1304cpususpend_handler(void)
1305{
1306	u_int cpu;
1307
1308	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
1309
1310	cpu = PCPU_GET(cpuid);
1311	if (savectx(&susppcbs[cpu]->sp_pcb)) {
1312#ifdef __amd64__
1313		fpususpend(susppcbs[cpu]->sp_fpususpend);
1314#else
1315		npxsuspend(susppcbs[cpu]->sp_fpususpend);
1316#endif
1317		/*
1318		 * suspended_cpus is cleared shortly after each AP is restarted
1319		 * by a Startup IPI, so that the BSP can proceed to restarting
1320		 * the next AP.
1321		 *
1322		 * resuming_cpus gets cleared when the AP completes
1323		 * initialization after having been released by the BSP.
1324		 * resuming_cpus is probably not the best name for the
1325		 * variable, because it is actually a set of processors that
1326		 * haven't resumed yet and haven't necessarily started resuming.
1327		 *
1328		 * Note that suspended_cpus is meaningful only for ACPI suspend
1329		 * as it's not really used for Xen suspend since the APs are
1330		 * automatically restored to the running state and the correct
1331		 * context.  For the same reason resumectx is never called in
1332		 * that case.
1333		 */
1334		CPU_SET_ATOMIC(cpu, &suspended_cpus);
1335		CPU_SET_ATOMIC(cpu, &resuming_cpus);
1336
1337		/*
1338		 * Invalidate the cache after setting the global status bits.
1339		 * The last AP to set its bit may end up being an Owner of the
1340		 * corresponding cache line in MOESI protocol.  The AP may be
1341		 * stopped before the cache line is written to the main memory.
1342		 */
1343		wbinvd();
1344	} else {
1345#ifdef __amd64__
1346		fpuresume(susppcbs[cpu]->sp_fpususpend);
1347#else
1348		npxresume(susppcbs[cpu]->sp_fpususpend);
1349#endif
1350		pmap_init_pat();
1351		initializecpu();
1352		PCPU_SET(switchtime, 0);
1353		PCPU_SET(switchticks, ticks);
1354
1355		/* Indicate that we have restarted and restored the context. */
1356		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1357	}
1358
1359	/* Wait for resume directive */
1360	while (!CPU_ISSET(cpu, &toresume_cpus))
1361		ia32_pause();
1362
1363	if (cpu_ops.cpu_resume)
1364		cpu_ops.cpu_resume();
1365#ifdef __amd64__
1366	if (vmm_resume_p)
1367		vmm_resume_p();
1368#endif
1369
1370	/* Resume MCA and local APIC */
1371	lapic_xapic_mode();
1372	mca_resume();
1373	lapic_setup(0);
1374
1375	/* Indicate that we are resumed */
1376	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
1377	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1378	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
1379}
1380
1381
1382void
1383invlcache_handler(void)
1384{
1385	uint32_t generation;
1386
1387#ifdef COUNT_IPIS
1388	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
1389#endif /* COUNT_IPIS */
1390
1391	/*
1392	 * Reading the generation here allows greater parallelism
1393	 * since wbinvd is a serializing instruction.  Without the
1394	 * temporary, we'd wait for wbinvd to complete, then the read
1395	 * would execute, then the dependent write, which must then
1396	 * complete before return from interrupt.
1397	 */
1398	generation = smp_tlb_generation;
1399	wbinvd();
1400	PCPU_SET(smp_tlb_done, generation);
1401}
1402
1403/*
1404 * This is called once the rest of the system is up and running and we're
1405 * ready to let the APs out of the pen.
1406 */
1407static void
1408release_aps(void *dummy __unused)
1409{
1410
1411	if (mp_ncpus == 1)
1412		return;
1413	atomic_store_rel_int(&aps_ready, 1);
1414	while (smp_started == 0)
1415		ia32_pause();
1416}
1417SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1418
1419#ifdef COUNT_IPIS
1420/*
1421 * Setup interrupt counters for IPI handlers.
1422 */
1423static void
1424mp_ipi_intrcnt(void *dummy)
1425{
1426	char buf[64];
1427	int i;
1428
1429	CPU_FOREACH(i) {
1430		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1431		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1432		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1433		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1434		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1435		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1436		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
1437		intrcnt_add(buf, &ipi_invlcache_counts[i]);
1438		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1439		intrcnt_add(buf, &ipi_preempt_counts[i]);
1440		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1441		intrcnt_add(buf, &ipi_ast_counts[i]);
1442		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1443		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1444		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1445		intrcnt_add(buf, &ipi_hardclock_counts[i]);
1446	}
1447}
1448SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1449#endif
1450
1451/*
1452 * Flush the TLB on other CPUs
1453 */
1454
1455/* Variables needed for SMP tlb shootdown. */
1456vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
1457pmap_t smp_tlb_pmap;
1458volatile uint32_t smp_tlb_generation;
1459
1460#ifdef __amd64__
1461#define	read_eflags() read_rflags()
1462#endif
1463
1464static void
1465smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
1466    vm_offset_t addr1, vm_offset_t addr2)
1467{
1468	cpuset_t other_cpus;
1469	volatile uint32_t *p_cpudone;
1470	uint32_t generation;
1471	int cpu;
1472
1473	/*
1474	 * Check for other cpus.  Return if none.
1475	 */
1476	if (CPU_ISFULLSET(&mask)) {
1477		if (mp_ncpus <= 1)
1478			return;
1479	} else {
1480		CPU_CLR(PCPU_GET(cpuid), &mask);
1481		if (CPU_EMPTY(&mask))
1482			return;
1483	}
1484
1485	if (!(read_eflags() & PSL_I))
1486		panic("%s: interrupts disabled", __func__);
1487	mtx_lock_spin(&smp_ipi_mtx);
1488	smp_tlb_addr1 = addr1;
1489	smp_tlb_addr2 = addr2;
1490	smp_tlb_pmap = pmap;
1491	generation = ++smp_tlb_generation;
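	/*
	 * Each target CPU acknowledges the request by storing this
	 * generation in its pc_smp_tlb_done; the loop at the end waits
	 * for every targeted CPU to catch up.
	 */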
1492	if (CPU_ISFULLSET(&mask)) {
1493		ipi_all_but_self(vector);
1494		other_cpus = all_cpus;
1495		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1496	} else {
1497		other_cpus = mask;
1498		while ((cpu = CPU_FFS(&mask)) != 0) {
1499			cpu--;
1500			CPU_CLR(cpu, &mask);
1501			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
1502			    cpu, vector);
1503			ipi_send_cpu(cpu, vector);
1504		}
1505	}
1506	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
1507		cpu--;
1508		CPU_CLR(cpu, &other_cpus);
1509		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
1510		while (*p_cpudone != generation)
1511			ia32_pause();
1512	}
1513	mtx_unlock_spin(&smp_ipi_mtx);
1514}
1515
1516void
1517smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
1518{
1519
1520	if (smp_started) {
1521		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
1522#ifdef COUNT_XINVLTLB_HITS
1523		ipi_global++;
1524#endif
1525	}
1526}
1527
1528void
1529smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
1530{
1531
1532	if (smp_started) {
1533		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
1534#ifdef COUNT_XINVLTLB_HITS
1535		ipi_page++;
1536#endif
1537	}
1538}
1539
1540void
1541smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
1542    pmap_t pmap)
1543{
1544
1545	if (smp_started) {
1546		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
1547		    addr1, addr2);
1548#ifdef COUNT_XINVLTLB_HITS
1549		ipi_range++;
1550		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1551#endif
1552	}
1553}
1554
1555void
1556smp_cache_flush(void)
1557{
1558
1559	if (smp_started) {
1560		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
1561		    0, 0);
1562	}
1563}
1564
1565/*
1566 * Handlers for TLB related IPIs
1567 */
1568void
1569invltlb_handler(void)
1570{
1571	uint32_t generation;
1572
1573#ifdef COUNT_XINVLTLB_HITS
1574	xhits_gbl[PCPU_GET(cpuid)]++;
1575#endif /* COUNT_XINVLTLB_HITS */
1576#ifdef COUNT_IPIS
1577	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
1578#endif /* COUNT_IPIS */
1579
1580	/*
1581	 * Reading the generation here allows greater parallelism
1582	 * since invalidating the TLB is a serializing operation.
1583	 */
1584	generation = smp_tlb_generation;
1585	if (smp_tlb_pmap == kernel_pmap)
1586		invltlb_glob();
1587	else
1588		invltlb();
1589	PCPU_SET(smp_tlb_done, generation);
1590}
1591
1592void
1593invlpg_handler(void)
1594{
1595	uint32_t generation;
1596
1597#ifdef COUNT_XINVLTLB_HITS
1598	xhits_pg[PCPU_GET(cpuid)]++;
1599#endif /* COUNT_XINVLTLB_HITS */
1600#ifdef COUNT_IPIS
1601	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
1602#endif /* COUNT_IPIS */
1603
1604	generation = smp_tlb_generation;	/* Overlap with serialization */
1605	invlpg(smp_tlb_addr1);
1606	PCPU_SET(smp_tlb_done, generation);
1607}
1608
1609void
1610invlrng_handler(void)
1611{
1612	vm_offset_t addr, addr2;
1613	uint32_t generation;
1614
1615#ifdef COUNT_XINVLTLB_HITS
1616	xhits_rng[PCPU_GET(cpuid)]++;
1617#endif /* COUNT_XINVLTLB_HITS */
1618#ifdef COUNT_IPIS
1619	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
1620#endif /* COUNT_IPIS */
1621
1622	addr = smp_tlb_addr1;
1623	addr2 = smp_tlb_addr2;
1624	generation = smp_tlb_generation;	/* Overlap with serialization */
1625	do {
1626		invlpg(addr);
1627		addr += PAGE_SIZE;
1628	} while (addr < addr2);
1629
1630	PCPU_SET(smp_tlb_done, generation);
1631}
1632