mp_x86.c revision 297577
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/x86/x86/mp_x86.c 297577 2016-04-05 10:36:40Z avg $");
29
30#ifdef __i386__
31#include "opt_apic.h"
32#endif
33#include "opt_cpu.h"
34#include "opt_kstack_pages.h"
35#include "opt_pmap.h"
36#include "opt_sched.h"
37#include "opt_smp.h"
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/bus.h>
42#include <sys/cons.h>	/* cngetc() */
43#include <sys/cpuset.h>
44#ifdef GPROF
45#include <sys/gmon.h>
46#endif
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/memrange.h>
52#include <sys/mutex.h>
53#include <sys/pcpu.h>
54#include <sys/proc.h>
55#include <sys/sched.h>
56#include <sys/smp.h>
57#include <sys/sysctl.h>
58
59#include <vm/vm.h>
60#include <vm/vm_param.h>
61#include <vm/pmap.h>
62#include <vm/vm_kern.h>
63#include <vm/vm_extern.h>
64
65#include <x86/apicreg.h>
66#include <machine/clock.h>
67#include <machine/cputypes.h>
68#include <x86/mca.h>
69#include <machine/md_var.h>
70#include <machine/pcb.h>
71#include <machine/psl.h>
72#include <machine/smp.h>
73#include <machine/specialreg.h>
74#include <machine/cpu.h>
75
76#define WARMBOOT_TARGET		0
77#define WARMBOOT_OFF		(KERNBASE + 0x0467)
78#define WARMBOOT_SEG		(KERNBASE + 0x0469)
79
80#define CMOS_REG		(0x70)
81#define CMOS_DATA		(0x71)
82#define BIOS_RESET		(0x0f)
83#define BIOS_WARM		(0x0a)
84
85/* lock region used by kernel profiling */
86int	mcount_lock;
87
88int	mp_naps;		/* # of Application Processors (APs) */
89int	boot_cpu_id = -1;	/* designated BSP */
90
91extern	struct pcpu __pcpu[];
92
93/* AP uses this during bootstrap.  Do not staticize.  */
94char *bootSTK;
95int bootAP;
96
97/* Free these after use */
98void *bootstacks[MAXCPU];
99void *dpcpu;
100
101struct pcb stoppcbs[MAXCPU];
102struct susppcb **susppcbs;
103
104#ifdef COUNT_IPIS
105/* Interrupt counts. */
106static u_long *ipi_preempt_counts[MAXCPU];
107static u_long *ipi_ast_counts[MAXCPU];
108u_long *ipi_invltlb_counts[MAXCPU];
109u_long *ipi_invlrng_counts[MAXCPU];
110u_long *ipi_invlpg_counts[MAXCPU];
111u_long *ipi_invlcache_counts[MAXCPU];
112u_long *ipi_rendezvous_counts[MAXCPU];
113static u_long *ipi_hardclock_counts[MAXCPU];
114#endif
115
116/* Default cpu_ops implementation. */
117struct cpu_ops cpu_ops;
118
119/*
120 * Local data and functions.
121 */
122
123static volatile cpuset_t ipi_stop_nmi_pending;
124
125/* used to hold the APs until we are ready to release them */
126struct mtx ap_boot_mtx;
127
128/* Set to 1 once we're ready to let the APs out of the pen. */
129volatile int aps_ready = 0;
130
131/*
132 * Store data from cpu_add() until later in the boot when we actually setup
133 * the APs.
134 */
135struct cpu_info cpu_info[MAX_APIC_ID + 1];
136int apic_cpuids[MAX_APIC_ID + 1];
137int cpu_apic_ids[MAXCPU];
138
139/* Holds pending bitmap based IPIs per CPU */
140volatile u_int cpu_ipi_pending[MAXCPU];
141
142static void	release_aps(void *dummy);
143
144static int	hyperthreading_allowed = 1;
145SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
146	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
147
148static struct topo_node topo_root;
149
150static int pkg_id_shift;
151static int core_id_shift;
152static int disabled_cpus;
153
154struct cache_info {
155	int	id_shift;
156	int	present;
157} static caches[MAX_CACHE_LEVELS];
158
159void
160mem_range_AP_init(void)
161{
162
163	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
164		mem_range_softc.mr_op->initAP(&mem_range_softc);
165}
166
167/*
168 * Round up to the next power of two, if necessary, and then
169 * take log2.
170 * Returns -1 if argument is zero.
171 */
172static __inline int
173mask_width(u_int x)
174{
175
176	return (fls(x << (1 - powerof2(x))) - 1);
177}
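
/*
 * Illustrative examples of the computation above: mask_width(1) == 0,
 * mask_width(4) == 2 and mask_width(6) == 3 (6 is first rounded up to 8);
 * mask_width(0) yields -1 because fls(0) == 0.
 */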
178
179/*
180 * Add a cache level to the cache topology description.
181 */
182static int
183add_deterministic_cache(int type, int level, int share_count)
184{
185
186	if (type == 0)
187		return (0);
188	if (type > 3) {
189		printf("unexpected cache type %d\n", type);
190		return (1);
191	}
192	if (type == 2) /* ignore instruction cache */
193		return (1);
194	if (level == 0 || level > MAX_CACHE_LEVELS) {
195		printf("unexpected cache level %d\n", level);
196		return (1);
197	}
198
199	if (caches[level - 1].present) {
200		printf("WARNING: multiple entries for L%u data cache\n", level);
201		printf("%u => %u\n", caches[level - 1].id_shift,
202		    mask_width(share_count));
203	}
204	caches[level - 1].id_shift = mask_width(share_count);
205	caches[level - 1].present = 1;
206
207	if (caches[level - 1].id_shift > pkg_id_shift) {
208		printf("WARNING: L%u data cache covers more "
209		    "APIC IDs than a package\n", level);
210		printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
211		caches[level - 1].id_shift = pkg_id_shift;
212	}
213	if (caches[level - 1].id_shift < core_id_shift) {
214		printf("WARNING: L%u data cache covers less "
215		    "APIC IDs than a core\n", level);
216		printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
217		caches[level - 1].id_shift = core_id_shift;
218	}
219
220	return (1);
221}
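
/*
 * The id_shift recorded above means that logical CPUs whose APIC IDs agree
 * in all bits at or above id_shift share one instance of this cache.  As an
 * illustrative example, a cache reported with share_count == 4 gets
 * id_shift == mask_width(4) == 2, so APIC IDs 0-3 map to cache instance
 * 0 (0 >> 2) and APIC IDs 4-7 map to instance 1 (4 >> 2).
 */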
222
223/*
224 * Determine topology of processing units and caches for AMD CPUs.
225 * See:
226 *  - AMD CPUID Specification (Publication # 25481)
227 *  - BKDG For AMD Family 10h Processors (Publication # 31116), section 2.15
228 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
229 * XXX At the moment the code does not recognize grouping of AMD CMT threads,
230 * if supported, into cores, so each thread is treated as being in its own
231 * core.  In other words, each logical CPU is considered to be a core.
232 */
233static void
234topo_probe_amd(void)
235{
236	u_int p[4];
237	int level;
238	int share_count;
239	int type;
240	int i;
241
242	/* No multi-core capability. */
243	if ((amd_feature2 & AMDID2_CMP) == 0)
244		return;
245
246	/* For families 10h and newer. */
247	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
248	    AMDID_COREID_SIZE_SHIFT;
249
250	/* For 0Fh family. */
251	if (pkg_id_shift == 0)
252		pkg_id_shift =
253		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
254
255	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
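		/*
		 * Walk the cache topology leaf.  As decoded below, each
		 * iteration of CPUID leaf 0x8000001d returns in EAX the
		 * cache type in bits 4:0, the cache level in bits 7:5 and
		 * the number of logical CPUs sharing the cache, minus one,
		 * in bits 25:14.  A type of zero ends the enumeration.
		 */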
256		for (i = 0; ; i++) {
257			cpuid_count(0x8000001d, i, p);
258			type = p[0] & 0x1f;
259			level = (p[0] >> 5) & 0x7;
260			share_count = 1 + ((p[0] >> 14) & 0xfff);
261
262			if (!add_deterministic_cache(type, level, share_count))
263				break;
264		}
265	} else {
266		if (cpu_exthigh >= 0x80000005) {
267			cpuid_count(0x80000005, 0, p);
268			if (((p[2] >> 24) & 0xff) != 0) {
269				caches[0].id_shift = 0;
270				caches[0].present = 1;
271			}
272		}
273		if (cpu_exthigh >= 0x80000006) {
274			cpuid_count(0x80000006, 0, p);
275			if (((p[2] >> 16) & 0xffff) != 0) {
276				caches[1].id_shift = 0;
277				caches[1].present = 1;
278			}
279			if (((p[3] >> 18) & 0x3fff) != 0) {
280
281				/*
282				 * TODO: Account for dual-node processors
283				 * where each node within a package has its own
284				 * L3 cache.
285				 */
286				caches[2].id_shift = pkg_id_shift;
287				caches[2].present = 1;
288			}
289		}
290	}
291}
292
293/*
294 * Determine topology of processing units for Intel CPUs
295 * using CPUID Leaf 1 and Leaf 4, if supported.
296 * See:
297 *  - Intel 64 Architecture Processor Topology Enumeration
298 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
299 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
300 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
301 */
302static void
303topo_probe_intel_0x4(void)
304{
305	u_int p[4];
306	int max_cores;
307	int max_logical;
308
309	/* Both zero and one here mean one logical processor per package. */
310	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
311	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
312	if (max_logical <= 1)
313		return;
314
315	if (cpu_high >= 0x4) {
316		cpuid_count(0x04, 0, p);
317		max_cores = ((p[0] >> 26) & 0x3f) + 1;
318	} else
319		max_cores = 1;
320
321	core_id_shift = mask_width(max_logical/max_cores);
322	KASSERT(core_id_shift >= 0,
323	    ("intel topo: max_cores > max_logical\n"));
324	pkg_id_shift = core_id_shift + mask_width(max_cores);
325}
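
/*
 * A hypothetical example of the calculation above: a package reporting
 * CPUID.1:EBX[23:16] == 16 logical CPUs and a leaf 4 maximum of 8 cores
 * gets core_id_shift = mask_width(16 / 8) = 1 and pkg_id_shift =
 * 1 + mask_width(8) = 4, i.e. APIC ID bit 0 selects the SMT thread,
 * bits 3:1 select the core and the remaining bits select the package.
 */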
326
327/*
328 * Determine topology of processing units for Intel CPUs
329 * using CPUID Leaf 11, if supported.
330 * See:
331 *  - Intel 64 Architecture Processor Topology Enumeration
332 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
333 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
334 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
335 */
336static void
337topo_probe_intel_0xb(void)
338{
339	u_int p[4];
340	int bits;
341	int type;
342	int i;
343
344	/* Fall back if CPUID leaf 11 doesn't really exist. */
345	cpuid_count(0x0b, 0, p);
346	if (p[1] == 0) {
347		topo_probe_intel_0x4();
348		return;
349	}
350
351	/* We only support three levels for now. */
352	for (i = 0; ; i++) {
353		cpuid_count(0x0b, i, p);
354
355		bits = p[0] & 0x1f;
356		type = (p[2] >> 8) & 0xff;
357
358		if (type == 0)
359			break;
360
361		/* TODO: check for duplicate (re-)assignment */
362		if (type == CPUID_TYPE_SMT)
363			core_id_shift = bits;
364		else if (type == CPUID_TYPE_CORE)
365			pkg_id_shift = bits;
366		else
367			printf("unknown CPU level type %d\n", type);
368	}
369
370	if (pkg_id_shift < core_id_shift) {
371		printf("WARNING: core covers more APIC IDs than a package\n");
372		core_id_shift = pkg_id_shift;
373	}
374}
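
/*
 * Note on the leaf 0xb decoding above: for each sub-leaf, EAX bits 4:0 give
 * the number of APIC ID bits to shift out to reach the next topology level
 * and ECX bits 15:8 give the level type.  The shift reported for the SMT
 * level therefore becomes core_id_shift (it strips the thread bits), while
 * the shift reported for the core level becomes pkg_id_shift (it strips
 * both the thread and the core bits).
 */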
375
376/*
377 * Determine topology of caches for Intel CPUs.
378 * See:
379 *  - Intel 64 Architecture Processor Topology Enumeration
380 *  - Intel 64 and IA-32 Architectures Software Developer's Manual
381 *    Volume 2A: Instruction Set Reference, A-M,
382 *    CPUID instruction
383 */
384static void
385topo_probe_intel_caches(void)
386{
387	u_int p[4];
388	int level;
389	int share_count;
390	int type;
391	int i;
392
393	if (cpu_high < 0x4) {
394		/*
395		 * Available cache levels and sizes can be determined via
396		 * CPUID leaf 2, but that requires a huge table of hardcoded
397		 * values, so for now just assume that the L1 and L2 caches are
398		 * potentially shared only by HTT processing units, if present.
399		 */
400		caches[0].id_shift = pkg_id_shift;
401		caches[0].present = 1;
402		caches[1].id_shift = pkg_id_shift;
403		caches[1].present = 1;
404		return;
405	}
406
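	/*
	 * Enumerate the deterministic cache parameters leaf.  The EAX layout
	 * decoded below (type in bits 4:0, level in bits 7:5, sharing count
	 * minus one in bits 25:14) mirrors AMD leaf 0x8000001d handled above.
	 */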
407	for (i = 0; ; i++) {
408		cpuid_count(0x4, i, p);
409		type = p[0] & 0x1f;
410		level = (p[0] >> 5) & 0x7;
411		share_count = 1 + ((p[0] >> 14) & 0xfff);
412
413		if (!add_deterministic_cache(type, level, share_count))
414			break;
415	}
416}
417
418/*
419 * Determine topology of processing units and caches for Intel CPUs.
420 * See:
421 *  - Intel 64 Architecture Processor Topology Enumeration
422 */
423static void
424topo_probe_intel(void)
425{
426
427	/*
428	 * Note that the 0x1 <= cpu_high < 4 case should be
429	 * compatible with the topo_probe_intel_0x4() logic when
430	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1), and it
431	 * should trigger the fallback otherwise.
432	 */
433	if (cpu_high >= 0xb)
434		topo_probe_intel_0xb();
435	else if (cpu_high >= 0x1)
436		topo_probe_intel_0x4();
437
438	topo_probe_intel_caches();
439}
440
441/*
442 * Topology information is queried only on the BSP, on which this
443 * code runs and for which it can query CPUID information.
444 * The topology is then extrapolated to all packages under the
445 * assumption that the APIC ID to hardware component ID mapping is
446 * homogeneous.
447 * That doesn't necessarily imply that the topology is uniform.
448 */
449void
450topo_probe(void)
451{
452	static int cpu_topo_probed = 0;
453	struct x86_topo_layer {
454		int type;
455		int subtype;
456		int id_shift;
457	} topo_layers[MAX_CACHE_LEVELS + 3];
458	struct topo_node *parent;
459	struct topo_node *node;
460	int layer;
461	int nlayers;
462	int node_id;
463	int i;
464
465	if (cpu_topo_probed)
466		return;
467
468	CPU_ZERO(&logical_cpus_mask);
469
470	if (mp_ncpus <= 1)
471		; /* nothing */
472	else if (cpu_vendor_id == CPU_VENDOR_AMD)
473		topo_probe_amd();
474	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
475		topo_probe_intel();
476
477	KASSERT(pkg_id_shift >= core_id_shift,
478	    ("bug in APIC topology discovery"));
479
480	nlayers = 0;
481	bzero(topo_layers, sizeof(topo_layers));
482
483	topo_layers[nlayers].type = TOPO_TYPE_PKG;
484	topo_layers[nlayers].id_shift = pkg_id_shift;
485	if (bootverbose)
486		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
487	nlayers++;
488
489	/*
490	 * Consider all caches to be within a package/chip
491	 * and "in front" of all sub-components like
492	 * cores and hardware threads.
493	 */
494	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
495		if (caches[i].present) {
496			KASSERT(caches[i].id_shift <= pkg_id_shift,
497				("bug in APIC topology discovery"));
498			KASSERT(caches[i].id_shift >= core_id_shift,
499				("bug in APIC topology discovery"));
500
501			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
502			topo_layers[nlayers].subtype = i + 1;
503			topo_layers[nlayers].id_shift = caches[i].id_shift;
504			if (bootverbose)
505				printf("L%u cache ID shift: %u\n",
506				    topo_layers[nlayers].subtype,
507				    topo_layers[nlayers].id_shift);
508			nlayers++;
509		}
510	}
511
512	if (pkg_id_shift > core_id_shift) {
513		topo_layers[nlayers].type = TOPO_TYPE_CORE;
514		topo_layers[nlayers].id_shift = core_id_shift;
515		if (bootverbose)
516			printf("Core ID shift: %u\n",
517			    topo_layers[nlayers].id_shift);
518		nlayers++;
519	}
520
521	topo_layers[nlayers].type = TOPO_TYPE_PU;
522	topo_layers[nlayers].id_shift = 0;
523	nlayers++;
524
525	topo_init_root(&topo_root);
526	for (i = 0; i <= MAX_APIC_ID; ++i) {
527		if (!cpu_info[i].cpu_present)
528			continue;
529
530		parent = &topo_root;
531		for (layer = 0; layer < nlayers; ++layer) {
532			node_id = i >> topo_layers[layer].id_shift;
533			parent = topo_add_node_by_hwid(parent, node_id,
534			    topo_layers[layer].type,
535			    topo_layers[layer].subtype);
536		}
537	}
538
539	parent = &topo_root;
540	for (layer = 0; layer < nlayers; ++layer) {
541		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
542		node = topo_find_node_by_hwid(parent, node_id,
543		    topo_layers[layer].type,
544		    topo_layers[layer].subtype);
545		topo_promote_child(node);
546		parent = node;
547	}
548
549	cpu_topo_probed = 1;
550}
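
/*
 * To illustrate the decomposition performed above with hypothetical shifts
 * pkg_id_shift = 4 and core_id_shift = 1: APIC ID 0x13 would be filed under
 * package node 0x13 >> 4 = 1, core node 0x13 >> 1 = 9 and PU node 0x13,
 * with any cache nodes inserted in between using their own id_shift values.
 */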
551
552/*
553 * Assign logical CPU IDs to local APICs.
554 */
555void
556assign_cpu_ids(void)
557{
558	struct topo_node *node;
559	u_int smt_mask;
560
561	smt_mask = (1u << core_id_shift) - 1;
562
563	/*
564	 * Assign CPU IDs to local APIC IDs and disable any CPUs
565	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
566	 */
567	mp_ncpus = 0;
568	TOPO_FOREACH(node, &topo_root) {
569		if (node->type != TOPO_TYPE_PU)
570			continue;
571
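		/*
		 * The low core_id_shift bits of an APIC ID select the
		 * hardware thread within a core, so any PU whose thread
		 * index differs from the BSP's is marked as a hyperthread.
		 */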
572		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
573			cpu_info[node->hwid].cpu_hyperthread = 1;
574
575		if (resource_disabled("lapic", node->hwid)) {
576			if (node->hwid != boot_cpu_id)
577				cpu_info[node->hwid].cpu_disabled = 1;
578			else
579				printf("Cannot disable BSP, APIC ID = %d\n",
580				    node->hwid);
581		}
582
583		if (!hyperthreading_allowed &&
584		    cpu_info[node->hwid].cpu_hyperthread)
585			cpu_info[node->hwid].cpu_disabled = 1;
586
587		if (mp_ncpus >= MAXCPU)
588			cpu_info[node->hwid].cpu_disabled = 1;
589
590		if (cpu_info[node->hwid].cpu_disabled) {
591			disabled_cpus++;
592			continue;
593		}
594
595		cpu_apic_ids[mp_ncpus] = node->hwid;
596		apic_cpuids[node->hwid] = mp_ncpus;
597		topo_set_pu_id(node, mp_ncpus);
598		mp_ncpus++;
599	}
600
601	KASSERT(mp_maxid >= mp_ncpus - 1,
602	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
603	    mp_ncpus));
604}
605
606/*
607 * Print various information about the SMP system hardware and setup.
608 */
609void
610cpu_mp_announce(void)
611{
612	struct topo_node *node;
613	const char *hyperthread;
614	int pkg_count;
615	int cores_per_pkg;
616	int thrs_per_core;
617
618	printf("FreeBSD/SMP: ");
619	if (topo_analyze(&topo_root, 1, &pkg_count,
620	    &cores_per_pkg, &thrs_per_core)) {
621		printf("%d package(s)", pkg_count);
622		if (cores_per_pkg > 0)
623			printf(" x %d core(s)", cores_per_pkg);
624		if (thrs_per_core > 1)
625			printf(" x %d hardware threads", thrs_per_core);
626	} else {
627		printf("Non-uniform topology");
628	}
629	printf("\n");
630
631	if (disabled_cpus) {
632		printf("FreeBSD/SMP Online: ");
633		if (topo_analyze(&topo_root, 0, &pkg_count,
634		    &cores_per_pkg, &thrs_per_core)) {
635			printf("%d package(s)", pkg_count);
636			if (cores_per_pkg > 0)
637				printf(" x %d core(s)", cores_per_pkg);
638			if (thrs_per_core > 1)
639				printf(" x %d hardware threads", thrs_per_core);
640		} else {
641			printf("Non-uniform topology");
642		}
643		printf("\n");
644	}
645
646	if (!bootverbose)
647		return;
648
649	TOPO_FOREACH(node, &topo_root) {
650		switch (node->type) {
651		case TOPO_TYPE_PKG:
652			printf("Package HW ID = %u (%#x)\n",
653			    node->hwid, node->hwid);
654			break;
655		case TOPO_TYPE_CORE:
656			printf("\tCore HW ID = %u (%#x)\n",
657			    node->hwid, node->hwid);
658			break;
659		case TOPO_TYPE_PU:
660			if (cpu_info[node->hwid].cpu_hyperthread)
661				hyperthread = "/HT";
662			else
663				hyperthread = "";
664
665			if (node->subtype == 0)
666				printf("\t\tCPU (AP%s): APIC ID: %u (%#x) "
667				    "(disabled)\n", hyperthread, node->hwid,
668				    node->hwid);
669			else if (node->id == 0)
670				printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
671				    node->hwid, node->hwid);
672			else
673				printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
674				    node->id, hyperthread, node->hwid,
675				    node->hwid);
676			break;
677		default:
678			/* ignored */
679			break;
680		}
681	}
682}
683
684/*
685 * Add a scheduling group, a group of logical processors sharing
686 * a particular cache (and, thus having an affinity), to the scheduling
687 * topology.
688 * This function recursively works on lower level caches.
689 */
690static void
691x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
692{
693	struct topo_node *node;
694	int nchildren;
695	int ncores;
696	int i;
697
698	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
699	    ("x86topo_add_sched_group: bad type: %u", root->type));
700	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
701	cg_root->cg_count = root->cpu_count;
702	if (root->type == TOPO_TYPE_SYSTEM)
703		cg_root->cg_level = CG_SHARE_NONE;
704	else
705		cg_root->cg_level = root->subtype;
706
707	/*
708	 * Check how many core nodes we have under the given root node.
709	 * If we have multiple logical processors, but not multiple
710	 * cores, then those processors must be hardware threads.
711	 */
712	ncores = 0;
713	node = root;
714	while (node != NULL) {
715		if (node->type != TOPO_TYPE_CORE) {
716			node = topo_next_node(root, node);
717			continue;
718		}
719
720		ncores++;
721		node = topo_next_nonchild_node(root, node);
722	}
723
724	if (cg_root->cg_level != CG_SHARE_NONE &&
725	    root->cpu_count > 1 && ncores < 2)
726		cg_root->cg_flags = CG_FLAG_SMT;
727
728	/*
729	 * Find out how many cache nodes we have under the given root node.
730	 * We ignore cache nodes that cover all the same processors as the
731	 * root node.  Also, we do not descend below found cache nodes.
732	 * That is, we count top-level "non-redundant" caches under the root
733	 * node.
734	 */
735	nchildren = 0;
736	node = root;
737	while (node != NULL) {
738		if (node->type != TOPO_TYPE_CACHE ||
739		    (root->type != TOPO_TYPE_SYSTEM &&
740		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
741			node = topo_next_node(root, node);
742			continue;
743		}
744		nchildren++;
745		node = topo_next_nonchild_node(root, node);
746	}
747
748	cg_root->cg_child = smp_topo_alloc(nchildren);
749	cg_root->cg_children = nchildren;
750
751	/*
752	 * Now find again the same cache nodes as above and recursively
753	 * build scheduling topologies for them.
754	 */
755	node = root;
756	i = 0;
757	while (node != NULL) {
758		if (node->type != TOPO_TYPE_CACHE ||
759		    (root->type != TOPO_TYPE_SYSTEM &&
760		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
761			node = topo_next_node(root, node);
762			continue;
763		}
764		cg_root->cg_child[i].cg_parent = cg_root;
765		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
766		i++;
767		node = topo_next_nonchild_node(root, node);
768	}
769}
770
771/*
772 * Build the MI scheduling topology from the discovered hardware topology.
773 */
774struct cpu_group *
775cpu_topo(void)
776{
777	struct cpu_group *cg_root;
778
779	if (mp_ncpus <= 1)
780		return (smp_topo_none());
781
782	cg_root = smp_topo_alloc(1);
783	x86topo_add_sched_group(&topo_root, cg_root);
784	return (cg_root);
785}
786
787
788/*
789 * Add a logical CPU to the topology.
790 */
791void
792cpu_add(u_int apic_id, char boot_cpu)
793{
794
795	if (apic_id > MAX_APIC_ID) {
796		panic("SMP: APIC ID %d too high", apic_id);
797		return;
798	}
799	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
800	    apic_id));
801	cpu_info[apic_id].cpu_present = 1;
802	if (boot_cpu) {
803		KASSERT(boot_cpu_id == -1,
804		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
805		    boot_cpu_id));
806		boot_cpu_id = apic_id;
807		cpu_info[apic_id].cpu_bsp = 1;
808	}
809	if (mp_ncpus < MAXCPU) {
810		mp_ncpus++;
811		mp_maxid = mp_ncpus - 1;
812	}
813	if (bootverbose)
814		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
815		    "AP");
816}
817
818void
819cpu_mp_setmaxid(void)
820{
821
822	/*
823	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
824	 * If there were no calls to cpu_add() assume this is a UP system.
825	 */
826	if (mp_ncpus == 0)
827		mp_ncpus = 1;
828}
829
830int
831cpu_mp_probe(void)
832{
833
834	/*
835	 * Always record BSP in CPU map so that the mbuf init code works
836	 * correctly.
837	 */
838	CPU_SETOF(0, &all_cpus);
839	return (mp_ncpus > 1);
840}
841
842/*
843 * AP CPUs call this to initialize themselves.
844 */
845void
846init_secondary_tail(void)
847{
848	u_int cpuid;
849
850	/*
851	 * On real hardware, switch to x2apic mode if possible.  Do it
852	 * after aps_ready has been signalled, to avoid manipulating the
853	 * mode while the BSP might still want to send some IPI to us
854	 * (a second startup IPI is ignored on modern hardware, etc.).
855	 */
856	lapic_xapic_mode();
857
858	/* Initialize the PAT MSR. */
859	pmap_init_pat();
860
861	/* set up CPU registers and state */
862	cpu_setregs();
863
864	/* set up SSE/NX */
865	initializecpu();
866
867	/* set up FPU state on the AP */
868#ifdef __amd64__
869	fpuinit();
870#else
871	npxinit(false);
872#endif
873
874	if (cpu_ops.cpu_init)
875		cpu_ops.cpu_init();
876
877	/* A quick check from sanity claus */
878	cpuid = PCPU_GET(cpuid);
879	if (PCPU_GET(apic_id) != lapic_id()) {
880		printf("SMP: cpuid = %d\n", cpuid);
881		printf("SMP: actual apic_id = %d\n", lapic_id());
882		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
883		panic("cpuid mismatch! boom!!");
884	}
885
886	/* Initialize curthread. */
887	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
888	PCPU_SET(curthread, PCPU_GET(idlethread));
889
890	mca_init();
891
892	mtx_lock_spin(&ap_boot_mtx);
893
894	/* Init local apic for irq's */
895	lapic_setup(1);
896
897	/* Set memory range attributes for this CPU to match the BSP */
898	mem_range_AP_init();
899
900	smp_cpus++;
901
902	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
903	printf("SMP: AP CPU #%d Launched!\n", cpuid);
904
905	/* Determine if we are a logical CPU. */
906	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
907		CPU_SET(cpuid, &logical_cpus_mask);
908
909	if (bootverbose)
910		lapic_dump("AP");
911
912	if (smp_cpus == mp_ncpus) {
913		/* Enable IPIs, TLB shootdowns, freezes, etc. */
914		atomic_store_rel_int(&smp_started, 1);
915	}
916
917#ifdef __amd64__
918	/*
919	 * Enable global pages TLB extension
920	 * This also implicitly flushes the TLB
921	 */
922	load_cr4(rcr4() | CR4_PGE);
923	if (pmap_pcid_enabled)
924		load_cr4(rcr4() | CR4_PCIDE);
925	load_ds(_udatasel);
926	load_es(_udatasel);
927	load_fs(_ufssel);
928#endif
929
930	mtx_unlock_spin(&ap_boot_mtx);
931
932	/* Wait until all the APs are up. */
933	while (atomic_load_acq_int(&smp_started) == 0)
934		ia32_pause();
935
936	/* Start per-CPU event timers. */
937	cpu_initclocks_ap();
938
939	sched_throw(NULL);
940
941	panic("scheduler returned us to %s", __func__);
942	/* NOTREACHED */
943}
944
945/*******************************************************************
946 * local functions and data
947 */
948
949/*
950 * We tell the I/O APIC code about all the CPUs we want to receive
951 * interrupts.  If we don't want certain CPUs to receive IRQs we
952 * can simply not tell the I/O APIC code about them in this function.
953 * We also do not tell it about the BSP since it tells itself about
954 * the BSP internally to work with UP kernels and on UP machines.
955 */
956void
957set_interrupt_apic_ids(void)
958{
959	u_int i, apic_id;
960
961	for (i = 0; i < MAXCPU; i++) {
962		apic_id = cpu_apic_ids[i];
963		if (apic_id == -1)
964			continue;
965		if (cpu_info[apic_id].cpu_bsp)
966			continue;
967		if (cpu_info[apic_id].cpu_disabled)
968			continue;
969
970		/* Don't let hyperthreads service interrupts. */
971		if (cpu_info[apic_id].cpu_hyperthread)
972			continue;
973
974		intr_add_cpu(i);
975	}
976}
977
978
979#ifdef COUNT_XINVLTLB_HITS
980u_int xhits_gbl[MAXCPU];
981u_int xhits_pg[MAXCPU];
982u_int xhits_rng[MAXCPU];
983static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
984SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
985    sizeof(xhits_gbl), "IU", "");
986SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
987    sizeof(xhits_pg), "IU", "");
988SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
989    sizeof(xhits_rng), "IU", "");
990
991u_int ipi_global;
992u_int ipi_page;
993u_int ipi_range;
994u_int ipi_range_size;
995SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
996SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
997SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
998SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
999    0, "");
1000#endif /* COUNT_XINVLTLB_HITS */
1001
1002/*
1003 * Init and startup IPI.
1004 */
1005void
1006ipi_startup(int apic_id, int vector)
1007{
1008
1009	/*
1010	 * This attempts to follow the algorithm described in the
1011	 * Intel Multiprocessor Specification v1.4 in section B.4.
1012	 * For each IPI, we allow the local APIC ~20us to deliver the
1013	 * IPI.  If that times out, we panic.
1014	 */
1015
1016	/*
1017	 * First we do an INIT IPI: this INIT IPI might be run, resetting
1018	 * and running the target CPU.  OR this INIT IPI might be latched (P5
1019	 * bug), leaving the CPU waiting for a STARTUP IPI.  OR this INIT IPI
1020	 * might be ignored.
1021	 */
1022	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1023	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1024	lapic_ipi_wait(100);
1025
1026	/* Explicitly deassert the INIT IPI. */
1027	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1028	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
1029	    apic_id);
1030
1031	DELAY(10000);		/* wait ~10 ms */
1032
1033	/*
1034	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
1035	 * latched (P5 bug); this 1st STARTUP would then terminate
1036	 * immediately, and the previously started INIT IPI would continue.  OR
1037	 * the previous INIT IPI has already run, and this STARTUP IPI will
1038	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
1039	 * will run.
1040	 */
1041	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1042	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1043	    vector, apic_id);
1044	if (!lapic_ipi_wait(100))
1045		panic("Failed to deliver first STARTUP IPI to APIC %d",
1046		    apic_id);
1047	DELAY(200);		/* wait ~200 us */
1048
1049	/*
1050	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1051	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
1052	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1053	 * recognized after a hardware RESET or INIT IPI.
1054	 */
1055	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1056	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1057	    vector, apic_id);
1058	if (!lapic_ipi_wait(100))
1059		panic("Failed to deliver second STARTUP IPI to APIC %d",
1060		    apic_id);
1061
1062	DELAY(200);		/* wait ~200 us */
1063}
1064
1065/*
1066 * Send an IPI to the specified CPU, handling the bitmap logic.
1067 */
1068void
1069ipi_send_cpu(int cpu, u_int ipi)
1070{
1071	u_int bitmap, old_pending, new_pending;
1072
1073	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
1074
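	/*
	 * Bitmapped IPIs are coalesced: the bit is OR-ed into the target
	 * CPU's pending word with a lock-free cmpset loop and the actual
	 * IPI_BITMAP_VECTOR interrupt is sent only if no bits were already
	 * pending, since the earlier sender's interrupt will service the
	 * newly set bit as well.
	 */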
1075	if (IPI_IS_BITMAPED(ipi)) {
1076		bitmap = 1 << ipi;
1077		ipi = IPI_BITMAP_VECTOR;
1078		do {
1079			old_pending = cpu_ipi_pending[cpu];
1080			new_pending = old_pending | bitmap;
1081		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
1082		    old_pending, new_pending));
1083		if (old_pending)
1084			return;
1085	}
1086	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1087}
1088
1089void
1090ipi_bitmap_handler(struct trapframe frame)
1091{
1092	struct trapframe *oldframe;
1093	struct thread *td;
1094	int cpu = PCPU_GET(cpuid);
1095	u_int ipi_bitmap;
1096
1097	critical_enter();
1098	td = curthread;
1099	td->td_intr_nesting_level++;
1100	oldframe = td->td_intr_frame;
1101	td->td_intr_frame = &frame;
1102	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1103	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1104#ifdef COUNT_IPIS
1105		(*ipi_preempt_counts[cpu])++;
1106#endif
1107		sched_preempt(td);
1108	}
1109	if (ipi_bitmap & (1 << IPI_AST)) {
1110#ifdef COUNT_IPIS
1111		(*ipi_ast_counts[cpu])++;
1112#endif
1113		/* Nothing to do for AST */
1114	}
1115	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1116#ifdef COUNT_IPIS
1117		(*ipi_hardclock_counts[cpu])++;
1118#endif
1119		hardclockintr();
1120	}
1121	td->td_intr_frame = oldframe;
1122	td->td_intr_nesting_level--;
1123	critical_exit();
1124}
1125
1126/*
1127 * Send an IPI to a set of CPUs.
1128 */
1129void
1130ipi_selected(cpuset_t cpus, u_int ipi)
1131{
1132	int cpu;
1133
1134	/*
1135	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1136	 * of help to identify the source.
1137	 * Set the mask of receiving CPUs for this purpose.
1138	 */
1139	if (ipi == IPI_STOP_HARD)
1140		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1141
1142	while ((cpu = CPU_FFS(&cpus)) != 0) {
1143		cpu--;
1144		CPU_CLR(cpu, &cpus);
1145		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1146		ipi_send_cpu(cpu, ipi);
1147	}
1148}
1149
1150/*
1151 * Send an IPI to a specific CPU.
1152 */
1153void
1154ipi_cpu(int cpu, u_int ipi)
1155{
1156
1157	/*
1158	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1159	 * of help to identify the source.
1160	 * Set the mask of receiving CPUs for this purpose.
1161	 */
1162	if (ipi == IPI_STOP_HARD)
1163		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1164
1165	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1166	ipi_send_cpu(cpu, ipi);
1167}
1168
1169/*
1170 * Send an IPI to all CPUs EXCEPT myself.
1171 */
1172void
1173ipi_all_but_self(u_int ipi)
1174{
1175	cpuset_t other_cpus;
1176
1177	other_cpus = all_cpus;
1178	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1179	if (IPI_IS_BITMAPED(ipi)) {
1180		ipi_selected(other_cpus, ipi);
1181		return;
1182	}
1183
1184	/*
1185	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1186	 * of help to identify the source.
1187	 * Set the mask of receiving CPUs for this purpose.
1188	 */
1189	if (ipi == IPI_STOP_HARD)
1190		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1191
1192	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1193	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1194}
1195
1196int
1197ipi_nmi_handler(void)
1198{
1199	u_int cpuid;
1200
1201	/*
1202	 * Since there is no simple way to know an NMI's source, if
1203	 * the bit for the current CPU is set in the global pending
1204	 * bitmask, an IPI_STOP_HARD has been issued and should be
1205	 * handled.
1206	 */
1207	cpuid = PCPU_GET(cpuid);
1208	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1209		return (1);
1210
1211	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1212	cpustop_handler();
1213	return (0);
1214}
1215
1216/*
1217 * Handle an IPI_STOP by saving our current context and spinning until we
1218 * are resumed.
1219 */
1220void
1221cpustop_handler(void)
1222{
1223	u_int cpu;
1224
1225	cpu = PCPU_GET(cpuid);
1226
1227	savectx(&stoppcbs[cpu]);
1228
1229	/* Indicate that we are stopped */
1230	CPU_SET_ATOMIC(cpu, &stopped_cpus);
1231
1232	/* Wait for restart */
1233	while (!CPU_ISSET(cpu, &started_cpus))
1234	    ia32_pause();
1235
1236	CPU_CLR_ATOMIC(cpu, &started_cpus);
1237	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1238
1239#if defined(__amd64__) && defined(DDB)
1240	amd64_db_resume_dbreg();
1241#endif
1242
1243	if (cpu == 0 && cpustop_restartfunc != NULL) {
1244		cpustop_restartfunc();
1245		cpustop_restartfunc = NULL;
1246	}
1247}
1248
1249/*
1250 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1251 * are resumed.
1252 */
1253void
1254cpususpend_handler(void)
1255{
1256	u_int cpu;
1257
1258	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
1259
1260	cpu = PCPU_GET(cpuid);
1261	if (savectx(&susppcbs[cpu]->sp_pcb)) {
1262#ifdef __amd64__
1263		fpususpend(susppcbs[cpu]->sp_fpususpend);
1264#else
1265		npxsuspend(susppcbs[cpu]->sp_fpususpend);
1266#endif
1267		wbinvd();
1268		CPU_SET_ATOMIC(cpu, &suspended_cpus);
1269	} else {
1270#ifdef __amd64__
1271		fpuresume(susppcbs[cpu]->sp_fpususpend);
1272#else
1273		npxresume(susppcbs[cpu]->sp_fpususpend);
1274#endif
1275		pmap_init_pat();
1276		initializecpu();
1277		PCPU_SET(switchtime, 0);
1278		PCPU_SET(switchticks, ticks);
1279
1280		/* Indicate that we are resumed */
1281		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1282	}
1283
1284	/* Wait for resume */
1285	while (!CPU_ISSET(cpu, &started_cpus))
1286		ia32_pause();
1287
1288	if (cpu_ops.cpu_resume)
1289		cpu_ops.cpu_resume();
1290#ifdef __amd64__
1291	if (vmm_resume_p)
1292		vmm_resume_p();
1293#endif
1294
1295	/* Resume MCA and local APIC */
1296	lapic_xapic_mode();
1297	mca_resume();
1298	lapic_setup(0);
1299
1300	/* Indicate that we are resumed */
1301	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1302	CPU_CLR_ATOMIC(cpu, &started_cpus);
1303}
1304
1305
1306void
1307invlcache_handler(void)
1308{
1309#ifdef COUNT_IPIS
1310	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
1311#endif /* COUNT_IPIS */
1312
1313	wbinvd();
1314	atomic_add_int(&smp_tlb_wait, 1);
1315}
1316
1317/*
1318 * This is called once the rest of the system is up and running and we're
1319 * ready to let the APs out of the pen.
1320 */
1321static void
1322release_aps(void *dummy __unused)
1323{
1324
1325	if (mp_ncpus == 1)
1326		return;
1327	atomic_store_rel_int(&aps_ready, 1);
1328	while (smp_started == 0)
1329		ia32_pause();
1330}
1331SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1332
1333#ifdef COUNT_IPIS
1334/*
1335 * Setup interrupt counters for IPI handlers.
1336 */
1337static void
1338mp_ipi_intrcnt(void *dummy)
1339{
1340	char buf[64];
1341	int i;
1342
1343	CPU_FOREACH(i) {
1344		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1345		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1346		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1347		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1348		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1349		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1350		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
1351		intrcnt_add(buf, &ipi_invlcache_counts[i]);
1352		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1353		intrcnt_add(buf, &ipi_preempt_counts[i]);
1354		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1355		intrcnt_add(buf, &ipi_ast_counts[i]);
1356		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1357		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1358		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1359		intrcnt_add(buf, &ipi_hardclock_counts[i]);
1360	}
1361}
1362SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1363#endif
1364
1365/*
1366 * Flush the TLB on other CPUs.
1367 */
1368
1369/* Variables needed for SMP tlb shootdown. */
1370static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
1371pmap_t smp_tlb_pmap;
1372volatile int smp_tlb_wait;
1373
1374#ifdef __amd64__
1375#define	read_eflags() read_rflags()
1376#endif
1377
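/*
 * The shootdown rendezvous implemented below: the initiating CPU publishes
 * the address range and pmap in the smp_tlb_* globals while holding
 * smp_ipi_mtx, sends the chosen invalidation vector to the target CPUs and
 * then spins until smp_tlb_wait has been incremented once by each target's
 * handler (invltlb_handler() and friends below).
 */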
1378static void
1379smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
1380    vm_offset_t addr1, vm_offset_t addr2)
1381{
1382	int cpu, ncpu, othercpus;
1383
1384	othercpus = mp_ncpus - 1;	/* does not shoot down self */
1385
1386	/*
1387	 * Check for other CPUs.  Return if none.
1388	 */
1389	if (CPU_ISFULLSET(&mask)) {
1390		if (othercpus < 1)
1391			return;
1392	} else {
1393		CPU_CLR(PCPU_GET(cpuid), &mask);
1394		if (CPU_EMPTY(&mask))
1395			return;
1396	}
1397
1398	if (!(read_eflags() & PSL_I))
1399		panic("%s: interrupts disabled", __func__);
1400	mtx_lock_spin(&smp_ipi_mtx);
1401	smp_tlb_addr1 = addr1;
1402	smp_tlb_addr2 = addr2;
1403	smp_tlb_pmap = pmap;
1404	smp_tlb_wait = 0;
1405	if (CPU_ISFULLSET(&mask)) {
1406		ncpu = othercpus;
1407		ipi_all_but_self(vector);
1408	} else {
1409		ncpu = 0;
1410		while ((cpu = CPU_FFS(&mask)) != 0) {
1411			cpu--;
1412			CPU_CLR(cpu, &mask);
1413			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
1414			    cpu, vector);
1415			ipi_send_cpu(cpu, vector);
1416			ncpu++;
1417		}
1418	}
1419	while (smp_tlb_wait < ncpu)
1420		ia32_pause();
1421	mtx_unlock_spin(&smp_ipi_mtx);
1422}
1423
1424void
1425smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
1426{
1427
1428	if (smp_started) {
1429		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
1430#ifdef COUNT_XINVLTLB_HITS
1431		ipi_global++;
1432#endif
1433	}
1434}
1435
1436void
1437smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
1438{
1439
1440	if (smp_started) {
1441		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
1442#ifdef COUNT_XINVLTLB_HITS
1443		ipi_page++;
1444#endif
1445	}
1446}
1447
1448void
1449smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
1450{
1451
1452	if (smp_started) {
1453		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
1454		    addr1, addr2);
1455#ifdef COUNT_XINVLTLB_HITS
1456		ipi_range++;
1457		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1458#endif
1459	}
1460}
1461
1462void
1463smp_cache_flush(void)
1464{
1465
1466	if (smp_started) {
1467		smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
1468		    0, 0);
1469	}
1470}
1471
1472/*
1473 * Handlers for TLB related IPIs
1474 */
1475void
1476invltlb_handler(void)
1477{
1478#ifdef COUNT_XINVLTLB_HITS
1479	xhits_gbl[PCPU_GET(cpuid)]++;
1480#endif /* COUNT_XINVLTLB_HITS */
1481#ifdef COUNT_IPIS
1482	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
1483#endif /* COUNT_IPIS */
1484
1485	if (smp_tlb_pmap == kernel_pmap)
1486		invltlb_glob();
1487	else
1488		invltlb();
1489	atomic_add_int(&smp_tlb_wait, 1);
1490}
1491
1492void
1493invlpg_handler(void)
1494{
1495#ifdef COUNT_XINVLTLB_HITS
1496	xhits_pg[PCPU_GET(cpuid)]++;
1497#endif /* COUNT_XINVLTLB_HITS */
1498#ifdef COUNT_IPIS
1499	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
1500#endif /* COUNT_IPIS */
1501
1502	invlpg(smp_tlb_addr1);
1503	atomic_add_int(&smp_tlb_wait, 1);
1504}
1505
1506void
1507invlrng_handler(void)
1508{
1509	vm_offset_t addr;
1510
1511#ifdef COUNT_XINVLTLB_HITS
1512	xhits_rng[PCPU_GET(cpuid)]++;
1513#endif /* COUNT_XINVLTLB_HITS */
1514#ifdef COUNT_IPIS
1515	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
1516#endif /* COUNT_IPIS */
1517
1518	addr = smp_tlb_addr1;
1519	do {
1520		invlpg(addr);
1521		addr += PAGE_SIZE;
1522	} while (addr < smp_tlb_addr2);
1523
1524	atomic_add_int(&smp_tlb_wait, 1);
1525}
1526