mp_machdep.c revision 214630
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 214630 2010-11-01 17:40:35Z jhb $");
29
30#include "opt_cpu.h"
31#include "opt_kstack_pages.h"
32#include "opt_mp_watchdog.h"
33#include "opt_sched.h"
34#include "opt_smp.h"
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/bus.h>
39#ifdef GPROF
40#include <sys/gmon.h>
41#endif
42#include <sys/kernel.h>
43#include <sys/ktr.h>
44#include <sys/lock.h>
45#include <sys/malloc.h>
46#include <sys/memrange.h>
47#include <sys/mutex.h>
48#include <sys/pcpu.h>
49#include <sys/proc.h>
50#include <sys/sched.h>
51#include <sys/smp.h>
52#include <sys/sysctl.h>
53
54#include <vm/vm.h>
55#include <vm/vm_param.h>
56#include <vm/pmap.h>
57#include <vm/vm_kern.h>
58#include <vm/vm_extern.h>
59
60#include <machine/apicreg.h>
61#include <machine/clock.h>
62#include <machine/cputypes.h>
63#include <machine/cpufunc.h>
64#include <x86/mca.h>
65#include <machine/md_var.h>
66#include <machine/mp_watchdog.h>
67#include <machine/pcb.h>
68#include <machine/psl.h>
69#include <machine/smp.h>
70#include <machine/specialreg.h>
71#include <machine/tss.h>
72
73#define WARMBOOT_TARGET		0
74#define WARMBOOT_OFF		(KERNBASE + 0x0467)
75#define WARMBOOT_SEG		(KERNBASE + 0x0469)
76
77#define CMOS_REG		(0x70)
78#define CMOS_DATA		(0x71)
79#define BIOS_RESET		(0x0f)
80#define BIOS_WARM		(0x0a)
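/*
 * To start the APs we point the warm-boot vector at 0x467/0x469 (WARMBOOT_OFF/
 * WARMBOOT_SEG) at the trampoline and write BIOS_WARM into the CMOS shutdown
 * status byte (BIOS_RESET), the conventional "warm start" request; both are
 * saved and restored around AP startup in start_all_aps() below.
 */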
81
82/* lock region used by kernel profiling */
83int	mcount_lock;
84
85int	mp_naps;		/* # of Application Processors */
86int	boot_cpu_id = -1;	/* designated BSP */
87
88extern  struct pcpu __pcpu[];
89
90/* AP uses this during bootstrap.  Do not staticize.  */
91char *bootSTK;
92static int bootAP;
93
94/* Free these after use */
95void *bootstacks[MAXCPU];
96
97/* Temporary variables for init_secondary()  */
98char *doublefault_stack;
99char *nmi_stack;
100void *dpcpu;
101
102struct pcb stoppcbs[MAXCPU];
103struct pcb **susppcbs = NULL;
104
105/* Variables needed for SMP tlb shootdown. */
106vm_offset_t smp_tlb_addr1;
107vm_offset_t smp_tlb_addr2;
108volatile int smp_tlb_wait;
109
110#ifdef COUNT_IPIS
111/* Interrupt counts. */
112static u_long *ipi_preempt_counts[MAXCPU];
113static u_long *ipi_ast_counts[MAXCPU];
114u_long *ipi_invltlb_counts[MAXCPU];
115u_long *ipi_invlrng_counts[MAXCPU];
116u_long *ipi_invlpg_counts[MAXCPU];
117u_long *ipi_invlcache_counts[MAXCPU];
118u_long *ipi_rendezvous_counts[MAXCPU];
119u_long *ipi_lazypmap_counts[MAXCPU];
120static u_long *ipi_hardclock_counts[MAXCPU];
121#endif
122
123extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
124
125/*
126 * Local data and functions.
127 */
128
129static volatile cpumask_t ipi_nmi_pending;
130
131/* used to hold the APs until we are ready to release them */
132static struct mtx ap_boot_mtx;
133
134/* Set to 1 once we're ready to let the APs out of the pen. */
135static volatile int aps_ready = 0;
136
137/*
138 * Store data from cpu_add() until later in the boot when we actually set up
139 * the APs.
140 */
141struct cpu_info {
142	int	cpu_present:1;
143	int	cpu_bsp:1;
144	int	cpu_disabled:1;
145	int	cpu_hyperthread:1;
146} static cpu_info[MAX_APIC_ID + 1];
147int cpu_apic_ids[MAXCPU];
148int apic_cpuids[MAX_APIC_ID + 1];
149
150/* Holds pending bitmap based IPIs per CPU */
151static volatile u_int cpu_ipi_pending[MAXCPU];
152
153static u_int boot_address;
154static int cpu_logical;			/* logical cpus per core */
155static int cpu_cores;			/* cores per package */
156
157static void	assign_cpu_ids(void);
158static void	set_interrupt_apic_ids(void);
159static int	start_all_aps(void);
160static int	start_ap(int apic_id);
161static void	release_aps(void *dummy);
162
163static int	hlt_logical_cpus;
164static u_int	hyperthreading_cpus;	/* logical cpus sharing L1 cache */
165static cpumask_t	hyperthreading_cpus_mask;
166static int	hyperthreading_allowed = 1;
167static struct	sysctl_ctx_list logical_cpu_clist;
168static u_int	bootMP_size;
169
170static void
171mem_range_AP_init(void)
172{
173	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
174		mem_range_softc.mr_op->initAP(&mem_range_softc);
175}
176
177static void
178topo_probe_amd(void)
179{
180
181	/* AMD processors do not support HTT. */
182	cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ?
183	    (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1;
184	cpu_logical = 1;
185}
186
187/*
188 * Round up to the next power of two, if necessary, and then
189 * take log2.
190 * Returns -1 if argument is zero.
191 */
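/*
 * For example, mask_width(1) == 0, mask_width(2) == 1, mask_width(3) == 2,
 * mask_width(4) == 2 and mask_width(0) == -1.
 */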
192static __inline int
193mask_width(u_int x)
194{
195
196	return (fls(x << (1 - powerof2(x))) - 1);
197}
198
199static void
200topo_probe_0x4(void)
201{
202	u_int p[4];
203	int pkg_id_bits;
204	int core_id_bits;
205	int max_cores;
206	int max_logical;
207	int id;
208
209	/* Both zero and one here mean one logical processor per package. */
210	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
211	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
212	if (max_logical <= 1)
213		return;
214
215	/*
216	 * Because of the uniformity assumption we examine only
217	 * those logical processors that belong to the same
218	 * package as the BSP.  Further, we count the number of
219	 * logical processors that belong to the same core
220	 * as the BSP, thus deducing the number of threads per core.
221	 */
222	cpuid_count(0x04, 0, p);
223	max_cores = ((p[0] >> 26) & 0x3f) + 1;
224	core_id_bits = mask_width(max_logical/max_cores);
225	if (core_id_bits < 0)
226		return;
227	pkg_id_bits = core_id_bits + mask_width(max_cores);
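	/*
	 * For example, a package reporting max_logical == 8 and max_cores == 4
	 * yields core_id_bits == 1 and pkg_id_bits == 3: APIC ID bit 0 selects
	 * the thread within a core, bits 2:1 select the core within the
	 * package and the remaining bits identify the package.
	 */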
228
229	for (id = 0; id <= MAX_APIC_ID; id++) {
230		/* Check logical CPU availability. */
231		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
232			continue;
233		/* Check if logical CPU has the same package ID. */
234		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
235			continue;
236		cpu_cores++;
237		/* Check if logical CPU has the same package and core IDs. */
238		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
239			cpu_logical++;
240	}
241
242	cpu_cores /= cpu_logical;
243	hyperthreading_cpus = cpu_logical;
244}
245
246static void
247topo_probe_0xb(void)
248{
249	u_int p[4];
250	int bits;
251	int cnt;
252	int i;
253	int logical;
254	int type;
255	int x;
256
257	/* We only support three levels for now. */
258	for (i = 0; i < 3; i++) {
259		cpuid_count(0x0b, i, p);
260
261		/* Fall back if CPU leaf 11 doesn't really exist. */
262		if (i == 0 && p[1] == 0) {
263			topo_probe_0x4();
264			return;
265		}
266
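		/*
		 * CPUID leaf 0x0b: EAX[4:0] is the APIC ID shift for the next
		 * level, EBX[15:0] the number of logical processors at this
		 * level and ECX[15:8] the level type (1 = SMT, 2 = core).
		 */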
267		bits = p[0] & 0x1f;
268		logical = p[1] &= 0xffff;
269		type = (p[2] >> 8) & 0xff;
270		if (type == 0 || logical == 0)
271			break;
272		/*
273		 * Because of the uniformity assumption we examine only
274		 * those logical processors that belong to the same
275		 * package as the BSP.
276		 */
277		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
278			if (!cpu_info[x].cpu_present ||
279			    cpu_info[x].cpu_disabled)
280				continue;
281			if (x >> bits == boot_cpu_id >> bits)
282				cnt++;
283		}
284		if (type == CPUID_TYPE_SMT)
285			cpu_logical = cnt;
286		else if (type == CPUID_TYPE_CORE)
287			cpu_cores = cnt;
288	}
289	if (cpu_logical == 0)
290		cpu_logical = 1;
291	cpu_cores /= cpu_logical;
292}
293
294/*
295 * Both the topology discovery code and the code that consumes topology
296 * information assume top-down uniformity of the topology.
297 * That is, all physical packages must be identical and each
298 * core in a package must have the same number of threads.
299 * Topology information is queried only on the BSP, on which this
300 * code runs and for which it can query CPUID information.
301 * The topology is then extrapolated to all packages using the
302 * uniformity assumption.
303 */
304static void
305topo_probe(void)
306{
307	static int cpu_topo_probed = 0;
308
309	if (cpu_topo_probed)
310		return;
311
312	logical_cpus_mask = 0;
313	if (cpu_vendor_id == CPU_VENDOR_AMD)
314		topo_probe_amd();
315	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
316		/*
317		 * See Intel(R) 64 Architecture Processor
318		 * Topology Enumeration article for details.
319		 *
320		 * Note that 0x1 <= cpu_high < 4 case should be
321		 * compatible with topo_probe_0x4() logic when
322		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
323		 * or it should trigger the fallback otherwise.
324		 */
325		if (cpu_high >= 0xb)
326			topo_probe_0xb();
327		else if (cpu_high >= 0x1)
328			topo_probe_0x4();
329	}
330
331	/*
332	 * Fallback: assume each logical CPU is in a separate
333	 * physical package.  That is, no multi-core, no SMT.
334	 */
335	if (cpu_cores == 0)
336		cpu_cores = 1;
337	if (cpu_logical == 0)
338		cpu_logical = 1;
339	cpu_topo_probed = 1;
340}
341
342struct cpu_group *
343cpu_topo(void)
344{
345	int cg_flags;
346
347	/*
348	 * Determine whether any threading flags are
349	 * necessary.
350	 */
351	topo_probe();
352	if (cpu_logical > 1 && hyperthreading_cpus)
353		cg_flags = CG_FLAG_HTT;
354	else if (cpu_logical > 1)
355		cg_flags = CG_FLAG_SMT;
356	else
357		cg_flags = 0;
358	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
359		printf("WARNING: Non-uniform processors.\n");
360		printf("WARNING: Using suboptimal topology.\n");
361		return (smp_topo_none());
362	}
363	/*
364	 * Neither multi-core nor hyper-threaded.
365	 */
366	if (cpu_logical * cpu_cores == 1)
367		return (smp_topo_none());
368	/*
369	 * Only HTT, no multi-core.
370	 */
371	if (cpu_logical > 1 && cpu_cores == 1)
372		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
373	/*
374	 * Only multi-core no HTT.
375	 * Only multi-core, no HTT.
376	if (cpu_cores > 1 && cpu_logical == 1)
377		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
378	/*
379	 * Both HTT and multi-core.
380	 */
381	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
382	    CG_SHARE_L1, cpu_logical, cg_flags));
383}
384
385/*
386 * Calculate usable address in base memory for AP trampoline code.
387 */
388u_int
389mp_bootaddress(u_int basemem)
390{
391
392	bootMP_size = mptramp_end - mptramp_start;
393	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
394	if (((basemem * 1024) - boot_address) < bootMP_size)
395		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
396	/* 3 levels of page table pages */
397	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);
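	/*
	 * For instance, with 639KB of base memory and assuming the trampoline
	 * fits in the remaining 3KB, boot_address ends up at 0x9f000 and the
	 * three page table pages occupy 0x9c000-0x9efff.
	 */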
398
399	return mptramp_pagetables;
400}
401
402void
403cpu_add(u_int apic_id, char boot_cpu)
404{
405
406	if (apic_id > MAX_APIC_ID) {
407		panic("SMP: APIC ID %d too high", apic_id);
408		return;
409	}
410	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
411	    apic_id));
412	cpu_info[apic_id].cpu_present = 1;
413	if (boot_cpu) {
414		KASSERT(boot_cpu_id == -1,
415		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
416		    boot_cpu_id));
417		boot_cpu_id = apic_id;
418		cpu_info[apic_id].cpu_bsp = 1;
419	}
420	if (mp_ncpus < MAXCPU) {
421		mp_ncpus++;
422		mp_maxid = mp_ncpus - 1;
423	}
424	if (bootverbose)
425		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
426		    "AP");
427}
428
429void
430cpu_mp_setmaxid(void)
431{
432
433	/*
434	 * mp_maxid should already be set by calls to cpu_add().
435	 * Just sanity check its value here.
436	 */
437	if (mp_ncpus == 0)
438		KASSERT(mp_maxid == 0,
439		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
440	else if (mp_ncpus == 1)
441		mp_maxid = 0;
442	else
443		KASSERT(mp_maxid >= mp_ncpus - 1,
444		    ("%s: counters out of sync: max %d, count %d", __func__,
445			mp_maxid, mp_ncpus));
446}
447
448int
449cpu_mp_probe(void)
450{
451
452	/*
453	 * Always record the BSP in the CPU map so that the mbuf init code works
454	 * correctly.
455	 */
456	all_cpus = 1;
457	if (mp_ncpus == 0) {
458		/*
459		 * No CPUs were found, so this must be a UP system.  Set up
460		 * the variables to represent a system with a single CPU
461		 * with an id of 0.
462		 */
463		mp_ncpus = 1;
464		return (0);
465	}
466
467	/* At least one CPU was found. */
468	if (mp_ncpus == 1) {
469		/*
470		 * One CPU was found, so this must be a UP system with
471		 * an I/O APIC.
472		 */
473		mp_maxid = 0;
474		return (0);
475	}
476
477	/* At least two CPUs were found. */
478	return (1);
479}
480
481/*
482 * Initialize the IPI handlers and start up the AP's.
483 */
484void
485cpu_mp_start(void)
486{
487	int i;
488
489	/* Initialize the logical ID to APIC ID table. */
490	for (i = 0; i < MAXCPU; i++) {
491		cpu_apic_ids[i] = -1;
492		cpu_ipi_pending[i] = 0;
493	}
494
495	/* Install an inter-CPU IPI for TLB invalidation */
496	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
497	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
498	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
499
500	/* Install an inter-CPU IPI for cache invalidation. */
501	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
502
503	/* Install an inter-CPU IPI for all-CPU rendezvous */
504	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
505
506	/* Install generic inter-CPU IPI handler */
507	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
508	       SDT_SYSIGT, SEL_KPL, 0);
509
510	/* Install an inter-CPU IPI for CPU stop/restart */
511	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
512
513	/* Install an inter-CPU IPI for CPU suspend/resume */
514	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
515
516	/* Set boot_cpu_id if needed. */
517	if (boot_cpu_id == -1) {
518		boot_cpu_id = PCPU_GET(apic_id);
519		cpu_info[boot_cpu_id].cpu_bsp = 1;
520	} else
521		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
522		    ("BSP's APIC ID doesn't match boot_cpu_id"));
523
524	/* Probe logical/physical core configuration. */
525	topo_probe();
526
527	assign_cpu_ids();
528
529	/* Start each Application Processor */
530	start_all_aps();
531
532	set_interrupt_apic_ids();
533}
534
535
536/*
537 * Print various information about the SMP system hardware and setup.
538 */
539void
540cpu_mp_announce(void)
541{
542	const char *hyperthread;
543	int i;
544
545	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
546	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
547	if (hyperthreading_cpus > 1)
548	    printf(" x %d HTT threads", cpu_logical);
549	else if (cpu_logical > 1)
550	    printf(" x %d SMT threads", cpu_logical);
551	printf("\n");
552
553	/* List active CPUs first. */
554	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
555	for (i = 1; i < mp_ncpus; i++) {
556		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
557			hyperthread = "/HT";
558		else
559			hyperthread = "";
560		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
561		    cpu_apic_ids[i]);
562	}
563
564	/* List disabled CPUs last. */
565	for (i = 0; i <= MAX_APIC_ID; i++) {
566		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
567			continue;
568		if (cpu_info[i].cpu_hyperthread)
569			hyperthread = "/HT";
570		else
571			hyperthread = "";
572		printf("  cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
573		    i);
574	}
575}
576
577/*
578 * AP CPUs call this to initialize themselves.
579 */
580void
581init_secondary(void)
582{
583	struct pcpu *pc;
584	struct nmi_pcpu *np;
585	u_int64_t msr, cr0;
586	int cpu, gsel_tss, x;
587	struct region_descriptor ap_gdt;
588
589	/* Set by the startup code for us to use */
590	cpu = bootAP;
591
592	/* Init tss */
593	common_tss[cpu] = common_tss[0];
594	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
595	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
596	    IOPAGES * PAGE_SIZE;
597	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
598
599	/* The NMI stack runs on IST2. */
600	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
601	common_tss[cpu].tss_ist2 = (long) np;
602
603	/* Prepare private GDT */
604	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
605	for (x = 0; x < NGDT; x++) {
606		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
607		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
608			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
609	}
610	ssdtosyssd(&gdt_segs[GPROC0_SEL],
611	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
612	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
613	ap_gdt.rd_base =  (long) &gdt[NGDT * cpu];
614	lgdt(&ap_gdt);			/* does magic intra-segment return */
615
616	/* Get per-cpu data */
617	pc = &__pcpu[cpu];
618
619	/* prime data page for it to use */
620	pcpu_init(pc, cpu, sizeof(struct pcpu));
621	dpcpu_init(dpcpu, cpu);
622	pc->pc_apic_id = cpu_apic_ids[cpu];
623	pc->pc_prvspace = pc;
624	pc->pc_curthread = 0;
625	pc->pc_tssp = &common_tss[cpu];
626	pc->pc_commontssp = &common_tss[cpu];
627	pc->pc_rsp0 = 0;
628	pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
629	    GPROC0_SEL];
630	pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
631	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
632	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
633	    GUSERLDT_SEL];
634
635	/* Save the per-cpu pointer for use by the NMI handler. */
636	np->np_pcpu = (register_t) pc;
637
638	wrmsr(MSR_FSBASE, 0);		/* User value */
639	wrmsr(MSR_GSBASE, (u_int64_t)pc);
640	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
641
642	lidt(&r_idt);
643
644	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
645	ltr(gsel_tss);
646
647	/*
648	 * Set to a known state:
649	 * Set by mpboot.s: CR0_PG, CR0_PE
650	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
651	 */
652	cr0 = rcr0();
653	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
654	load_cr0(cr0);
655
656	/* Set up the fast syscall stuff */
657	msr = rdmsr(MSR_EFER) | EFER_SCE;
658	wrmsr(MSR_EFER, msr);
659	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
660	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
661	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
662	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
663	wrmsr(MSR_STAR, msr);
664	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
665
666	/* Disable local APIC just to be sure. */
667	lapic_disable();
668
669	/* signal our startup to the BSP. */
670	mp_naps++;
671
672	/* Spin until the BSP releases the AP's. */
673	while (!aps_ready)
674		ia32_pause();
675
676	/* Initialize the PAT MSR. */
677	pmap_init_pat();
678
679	/* set up CPU registers and state */
680	cpu_setregs();
681
682	/* set up SSE/NX registers */
683	initializecpu();
684
685	/* set up FPU state on the AP */
686	fpuinit();
687
688	/* A quick check from sanity claus */
689	if (PCPU_GET(apic_id) != lapic_id()) {
690		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
691		printf("SMP: actual apic_id = %d\n", lapic_id());
692		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
693		panic("cpuid mismatch! boom!!");
694	}
695
696	/* Initialize curthread. */
697	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
698	PCPU_SET(curthread, PCPU_GET(idlethread));
699
700	mca_init();
701
702	mtx_lock_spin(&ap_boot_mtx);
703
704	/* Init local apic for irq's */
705	lapic_setup(1);
706
707	/* Set memory range attributes for this CPU to match the BSP */
708	mem_range_AP_init();
709
710	smp_cpus++;
711
712	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
713	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
714
715	/* Determine if we are a logical CPU. */
716	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
717	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
718		logical_cpus_mask |= PCPU_GET(cpumask);
719
720	/* Determine if we are a hyperthread. */
721	if (hyperthreading_cpus > 1 &&
722	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
723		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
724
725	/* Build our map of 'other' CPUs. */
726	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
727
728	if (bootverbose)
729		lapic_dump("AP");
730
731	if (smp_cpus == mp_ncpus) {
732		/* enable IPI's, tlb shootdown, freezes etc */
733		atomic_store_rel_int(&smp_started, 1);
734		smp_active = 1;	 /* historic */
735	}
736
737	/*
738	 * Enable global pages TLB extension
739	 * This also implicitly flushes the TLB
740	 */
741
742	load_cr4(rcr4() | CR4_PGE);
743	load_ds(_udatasel);
744	load_es(_udatasel);
745	load_fs(_ufssel);
746	mtx_unlock_spin(&ap_boot_mtx);
747
748	/* Wait until all the APs are up. */
749	while (smp_started == 0)
750		ia32_pause();
751
752	/* Start per-CPU event timers. */
753	cpu_initclocks_ap();
754
755	sched_throw(NULL);
756
757	panic("scheduler returned us to %s", __func__);
758	/* NOTREACHED */
759}
760
761/*******************************************************************
762 * local functions and data
763 */
764
765/*
766 * We tell the I/O APIC code about all the CPUs that should receive
767 * interrupts.  If we don't want certain CPUs to receive IRQs we
768 * can simply not tell the I/O APIC code about them in this function.
769 * We also do not tell it about the BSP since it tells itself about
770 * the BSP internally to work with UP kernels and on UP machines.
771 */
772static void
773set_interrupt_apic_ids(void)
774{
775	u_int i, apic_id;
776
777	for (i = 0; i < MAXCPU; i++) {
778		apic_id = cpu_apic_ids[i];
779		if (apic_id == -1)
780			continue;
781		if (cpu_info[apic_id].cpu_bsp)
782			continue;
783		if (cpu_info[apic_id].cpu_disabled)
784			continue;
785
786		/* Don't let hyperthreads service interrupts. */
787		if (hyperthreading_cpus > 1 &&
788		    apic_id % hyperthreading_cpus != 0)
789			continue;
790
791		intr_add_cpu(i);
792	}
793}
794
795/*
796 * Assign logical CPU IDs to local APICs.
797 */
798static void
799assign_cpu_ids(void)
800{
801	u_int i;
802
803	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
804	    &hyperthreading_allowed);
805
806	/* Check for explicitly disabled CPUs. */
807	for (i = 0; i <= MAX_APIC_ID; i++) {
808		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
809			continue;
810
811		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
812			cpu_info[i].cpu_hyperthread = 1;
813#if defined(SCHED_ULE)
814			/*
815			 * Don't use HT CPU if it has been disabled by a
816			 * tunable.
817			 */
818			if (hyperthreading_allowed == 0) {
819				cpu_info[i].cpu_disabled = 1;
820				continue;
821			}
822#endif
823		}
824
825		/* Don't use this CPU if it has been disabled by a tunable. */
826		if (resource_disabled("lapic", i)) {
827			cpu_info[i].cpu_disabled = 1;
828			continue;
829		}
830	}
831
832	/*
833	 * Assign CPU IDs to local APIC IDs and disable any CPUs
834	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
835	 *
836	 * To minimize confusion for userland, we attempt to number
837	 * CPUs such that all threads and cores in a package are
838	 * grouped together.  For now we assume that the BSP is always
839	 * the first thread in a package and just start adding APs
840	 * starting with the BSP's APIC ID.
841	 */
842	mp_ncpus = 1;
843	cpu_apic_ids[0] = boot_cpu_id;
844	apic_cpuids[boot_cpu_id] = 0;
845	for (i = boot_cpu_id + 1; i != boot_cpu_id;
846	     i == MAX_APIC_ID ? i = 0 : i++) {
847		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
848		    cpu_info[i].cpu_disabled)
849			continue;
850
851		if (mp_ncpus < MAXCPU) {
852			cpu_apic_ids[mp_ncpus] = i;
853			apic_cpuids[i] = mp_ncpus;
854			mp_ncpus++;
855		} else
856			cpu_info[i].cpu_disabled = 1;
857	}
858	KASSERT(mp_maxid >= mp_ncpus - 1,
859	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
860	    mp_ncpus));
861}
862
863/*
864 * start each AP in our list
865 */
866static int
867start_all_aps(void)
868{
869	vm_offset_t va = boot_address + KERNBASE;
870	u_int64_t *pt4, *pt3, *pt2;
871	u_int32_t mpbioswarmvec;
872	int apic_id, cpu, i;
873	u_char mpbiosreason;
874
875	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
876
877	/* install the AP 1st level boot code */
878	pmap_kenter(va, boot_address);
879	pmap_invalidate_page(kernel_pmap, va);
880	bcopy(mptramp_start, (void *)va, bootMP_size);
881
882	/* Locate the page tables, they'll be below the trampoline */
883	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
884	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
885	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
886
887	/* Create the initial 1GB replicated page tables */
888	for (i = 0; i < 512; i++) {
889		/* Each slot of the level 4 pages points to the same level 3 page */
890		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
891		pt4[i] |= PG_V | PG_RW | PG_U;
892
893		/* Each slot of the level 3 pages points to the same level 2 page */
894		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
895		pt3[i] |= PG_V | PG_RW | PG_U;
896
897		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
898		pt2[i] = i * (2 * 1024 * 1024);
899		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
900	}
901
902	/* save the current value of the warm-start vector */
903	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
904	outb(CMOS_REG, BIOS_RESET);
905	mpbiosreason = inb(CMOS_DATA);
906
907	/* setup a vector to our boot code */
908	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
909	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
910	outb(CMOS_REG, BIOS_RESET);
911	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
912
913	/* start each AP */
914	for (cpu = 1; cpu < mp_ncpus; cpu++) {
915		apic_id = cpu_apic_ids[cpu];
916
917		/* allocate and set up an idle stack data page */
918		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
919		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
920		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
921		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
922
923		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
924		bootAP = cpu;
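		/*
		 * The trampoline loads bootSTK into %rsp before calling
		 * init_secondary(), which in turn uses bootAP to find its
		 * logical CPU number.
		 */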
925
926		/* attempt to start the Application Processor */
927		if (!start_ap(apic_id)) {
928			/* restore the warmstart vector */
929			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
930			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
931		}
932
933		all_cpus |= (1 << cpu);		/* record AP in CPU map */
934	}
935
936	/* build our map of 'other' CPUs */
937	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
938
939	/* restore the warmstart vector */
940	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
941
942	outb(CMOS_REG, BIOS_RESET);
943	outb(CMOS_DATA, mpbiosreason);
944
945	/* number of APs actually started */
946	return mp_naps;
947}
948
949
950/*
951 * This function starts the AP (application processor) identified
952 * by the APIC ID 'apic_id'.  It does quite a "song and dance"
953 * to accomplish this.  This is necessary because of the nuances
954 * of the different hardware we might encounter.  It isn't pretty,
955 * but it seems to work.
956 */
957static int
958start_ap(int apic_id)
959{
960	int vector, ms;
961	int cpus;
962
963	/* calculate the vector */
964	vector = (boot_address >> 12) & 0xff;
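	/*
	 * A STARTUP IPI with vector V makes the AP begin execution in real
	 * mode at physical address V << 12, so a trampoline at 0x9f000, for
	 * instance, corresponds to vector 0x9f.
	 */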
965
966	/* remember the current AP count so we can detect this AP's startup */
967	cpus = mp_naps;
968
969	/*
970	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
971	 * resetting and running the target CPU; OR this INIT IPI might be
972	 * latched (P5 bug), with the CPU waiting for a STARTUP IPI; OR this
973	 * INIT IPI might be ignored.
974	 */
975
976	/* do an INIT IPI: assert RESET */
977	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
978	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
979
980	/* wait for pending status end */
981	lapic_ipi_wait(-1);
982
983	/* do an INIT IPI: deassert RESET */
984	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
985	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
986
987	/* wait for pending status end */
988	DELAY(10000);		/* wait ~10ms */
989	lapic_ipi_wait(-1);
990
991	/*
992	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
993	 * latched (P5 bug), in which case this 1st STARTUP would terminate
994	 * immediately and the previously started INIT IPI would continue; OR
995	 * the previous INIT IPI has already run, and this STARTUP IPI will
996	 * run; OR the previous INIT IPI was ignored, and this STARTUP IPI
997	 * will run.
998	 */
999
1000	/* do a STARTUP IPI */
1001	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1002	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1003	    vector, apic_id);
1004	lapic_ipi_wait(-1);
1005	DELAY(200);		/* wait ~200us */
1006
1007	/*
1008	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1009	 * the previous STARTUP IPI was cancelled by a latched INIT IPI; OR
1010	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1011	 * recognized after hardware RESET or INIT IPI.
1012	 */
1013
1014	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1015	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1016	    vector, apic_id);
1017	lapic_ipi_wait(-1);
1018	DELAY(200);		/* wait ~200us */
1019
1020	/* Wait up to 5 seconds for it to start. */
1021	for (ms = 0; ms < 5000; ms++) {
1022		if (mp_naps > cpus)
1023			return 1;	/* return SUCCESS */
1024		DELAY(1000);
1025	}
1026	return 0;		/* return FAILURE */
1027}
1028
1029#ifdef COUNT_XINVLTLB_HITS
1030u_int xhits_gbl[MAXCPU];
1031u_int xhits_pg[MAXCPU];
1032u_int xhits_rng[MAXCPU];
1033SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
1034SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
1035    sizeof(xhits_gbl), "IU", "");
1036SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
1037    sizeof(xhits_pg), "IU", "");
1038SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
1039    sizeof(xhits_rng), "IU", "");
1040
1041u_int ipi_global;
1042u_int ipi_page;
1043u_int ipi_range;
1044u_int ipi_range_size;
1045SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
1046SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
1047SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
1048SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
1049    0, "");
1050
1051u_int ipi_masked_global;
1052u_int ipi_masked_page;
1053u_int ipi_masked_range;
1054u_int ipi_masked_range_size;
1055SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
1056    &ipi_masked_global, 0, "");
1057SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
1058    &ipi_masked_page, 0, "");
1059SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
1060    &ipi_masked_range, 0, "");
1061SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
1062    &ipi_masked_range_size, 0, "");
1063#endif /* COUNT_XINVLTLB_HITS */
1064
1065/*
1066 * Flush the TLB on all other CPUs
1067 */
1068static void
1069smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1070{
1071	u_int ncpu;
1072
1073	ncpu = mp_ncpus - 1;	/* does not shoot down self */
1074	if (ncpu < 1)
1075		return;		/* no other cpus */
1076	if (!(read_rflags() & PSL_I))
1077		panic("%s: interrupts disabled", __func__);
1078	mtx_lock_spin(&smp_ipi_mtx);
1079	smp_tlb_addr1 = addr1;
1080	smp_tlb_addr2 = addr2;
1081	atomic_store_rel_int(&smp_tlb_wait, 0);
1082	ipi_all_but_self(vector);
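	/* Each remote handler increments smp_tlb_wait once it has invalidated. */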
1083	while (smp_tlb_wait < ncpu)
1084		ia32_pause();
1085	mtx_unlock_spin(&smp_ipi_mtx);
1086}
1087
1088static void
1089smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1090{
1091	int ncpu, othercpus;
1092
1093	othercpus = mp_ncpus - 1;
1094	if (mask == (cpumask_t)-1) {
1095		ncpu = othercpus;
1096		if (ncpu < 1)
1097			return;
1098	} else {
1099		mask &= ~PCPU_GET(cpumask);
1100		if (mask == 0)
1101			return;
1102		ncpu = bitcount32(mask);
1103		if (ncpu > othercpus) {
1104			/* XXX this should be a panic offence */
1105			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1106			    ncpu, othercpus);
1107			ncpu = othercpus;
1108		}
1109		/* XXX should be a panic, implied by mask == 0 above */
1110		if (ncpu < 1)
1111			return;
1112	}
1113	if (!(read_rflags() & PSL_I))
1114		panic("%s: interrupts disabled", __func__);
1115	mtx_lock_spin(&smp_ipi_mtx);
1116	smp_tlb_addr1 = addr1;
1117	smp_tlb_addr2 = addr2;
1118	atomic_store_rel_int(&smp_tlb_wait, 0);
1119	if (mask == (cpumask_t)-1)
1120		ipi_all_but_self(vector);
1121	else
1122		ipi_selected(mask, vector);
1123	while (smp_tlb_wait < ncpu)
1124		ia32_pause();
1125	mtx_unlock_spin(&smp_ipi_mtx);
1126}
1127
1128/*
1129 * Send an IPI to the specified CPU, handling the bitmap logic.
1130 */
1131static void
1132ipi_send_cpu(int cpu, u_int ipi)
1133{
1134	u_int bitmap, old_pending, new_pending;
1135
1136	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
1137
1138	if (IPI_IS_BITMAPED(ipi)) {
1139		bitmap = 1 << ipi;
1140		ipi = IPI_BITMAP_VECTOR;
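		/*
		 * Atomically OR the request into the target CPU's pending
		 * bitmap; if bits were already pending, an IPI_BITMAP_VECTOR
		 * interrupt is already on its way and need not be resent.
		 */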
1141		do {
1142			old_pending = cpu_ipi_pending[cpu];
1143			new_pending = old_pending | bitmap;
1144		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
1145		    old_pending, new_pending));
1146		if (old_pending)
1147			return;
1148	}
1149	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1150}
1151
1152void
1153smp_cache_flush(void)
1154{
1155
1156	if (smp_started)
1157		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1158}
1159
1160void
1161smp_invltlb(void)
1162{
1163
1164	if (smp_started) {
1165		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1166#ifdef COUNT_XINVLTLB_HITS
1167		ipi_global++;
1168#endif
1169	}
1170}
1171
1172void
1173smp_invlpg(vm_offset_t addr)
1174{
1175
1176	if (smp_started) {
1177		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1178#ifdef COUNT_XINVLTLB_HITS
1179		ipi_page++;
1180#endif
1181	}
1182}
1183
1184void
1185smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1186{
1187
1188	if (smp_started) {
1189		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1190#ifdef COUNT_XINVLTLB_HITS
1191		ipi_range++;
1192		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1193#endif
1194	}
1195}
1196
1197void
1198smp_masked_invltlb(cpumask_t mask)
1199{
1200
1201	if (smp_started) {
1202		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1203#ifdef COUNT_XINVLTLB_HITS
1204		ipi_masked_global++;
1205#endif
1206	}
1207}
1208
1209void
1210smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
1211{
1212
1213	if (smp_started) {
1214		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1215#ifdef COUNT_XINVLTLB_HITS
1216		ipi_masked_page++;
1217#endif
1218	}
1219}
1220
1221void
1222smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
1223{
1224
1225	if (smp_started) {
1226		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1227#ifdef COUNT_XINVLTLB_HITS
1228		ipi_masked_range++;
1229		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1230#endif
1231	}
1232}
1233
1234void
1235ipi_bitmap_handler(struct trapframe frame)
1236{
1237	struct trapframe *oldframe;
1238	struct thread *td;
1239	int cpu = PCPU_GET(cpuid);
1240	u_int ipi_bitmap;
1241
1242	critical_enter();
1243	td = curthread;
1244	td->td_intr_nesting_level++;
1245	oldframe = td->td_intr_frame;
1246	td->td_intr_frame = &frame;
1247	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1248	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1249#ifdef COUNT_IPIS
1250		(*ipi_preempt_counts[cpu])++;
1251#endif
1252		sched_preempt(td);
1253	}
1254	if (ipi_bitmap & (1 << IPI_AST)) {
1255#ifdef COUNT_IPIS
1256		(*ipi_ast_counts[cpu])++;
1257#endif
1258		/* Nothing to do for AST */
1259	}
1260	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1261#ifdef COUNT_IPIS
1262		(*ipi_hardclock_counts[cpu])++;
1263#endif
1264		hardclockintr();
1265	}
1266	td->td_intr_frame = oldframe;
1267	td->td_intr_nesting_level--;
1268	critical_exit();
1269}
1270
1271/*
1272 * send an IPI to a set of cpus.
1273 */
1274void
1275ipi_selected(cpumask_t cpus, u_int ipi)
1276{
1277	int cpu;
1278
1279	/*
1280	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1281	 * of help in order to understand what the source is.
1282	 * Set the mask of receiving CPUs for this purpose.
1283	 */
1284	if (ipi == IPI_STOP_HARD)
1285		atomic_set_int(&ipi_nmi_pending, cpus);
1286
1287	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1288	while ((cpu = ffs(cpus)) != 0) {
1289		cpu--;
1290		cpus &= ~(1 << cpu);
1291		ipi_send_cpu(cpu, ipi);
1292	}
1293}
1294
1295/*
1296 * send an IPI to a specific CPU.
1297 */
1298void
1299ipi_cpu(int cpu, u_int ipi)
1300{
1301
1302	/*
1303	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1304	 * of help in order to understand what the source is.
1305	 * Set the mask of receiving CPUs for this purpose.
1306	 */
1307	if (ipi == IPI_STOP_HARD)
1308		atomic_set_int(&ipi_nmi_pending, 1 << cpu);
1309
1310	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1311	ipi_send_cpu(cpu, ipi);
1312}
1313
1314/*
1315 * send an IPI to all CPUs EXCEPT myself
1316 */
1317void
1318ipi_all_but_self(u_int ipi)
1319{
1320
1321	if (IPI_IS_BITMAPED(ipi)) {
1322		ipi_selected(PCPU_GET(other_cpus), ipi);
1323		return;
1324	}
1325
1326	/*
1327	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1328	 * of help in order to understand what the source is.
1329	 * Set the mask of receiving CPUs for this purpose.
1330	 */
1331	if (ipi == IPI_STOP_HARD)
1332		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));
1333
1334	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1335	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1336}
1337
1338int
1339ipi_nmi_handler()
1340{
1341	cpumask_t cpumask;
1342
1343	/*
1344	 * As long as there is not a simple way to know about an NMI's
1345	 * source, if the bitmask for the current CPU is present in
1346	 * the global pending bitword, an IPI_STOP_HARD has been issued
1347	 * and should be handled.
1348	 */
1349	cpumask = PCPU_GET(cpumask);
1350	if ((ipi_nmi_pending & cpumask) == 0)
1351		return (1);
1352
1353	atomic_clear_int(&ipi_nmi_pending, cpumask);
1354	cpustop_handler();
1355	return (0);
1356}
1357
1358/*
1359 * Handle an IPI_STOP by saving our current context and spinning until we
1360 * are resumed.
1361 */
1362void
1363cpustop_handler(void)
1364{
1365	cpumask_t cpumask;
1366	u_int cpu;
1367
1368	cpu = PCPU_GET(cpuid);
1369	cpumask = PCPU_GET(cpumask);
1370
1371	savectx(&stoppcbs[cpu]);
1372
1373	/* Indicate that we are stopped */
1374	atomic_set_int(&stopped_cpus, cpumask);
1375
1376	/* Wait for restart */
1377	while (!(started_cpus & cpumask))
1378	    ia32_pause();
1379
1380	atomic_clear_int(&started_cpus, cpumask);
1381	atomic_clear_int(&stopped_cpus, cpumask);
1382
1383	if (cpu == 0 && cpustop_restartfunc != NULL) {
1384		cpustop_restartfunc();
1385		cpustop_restartfunc = NULL;
1386	}
1387}
1388
1389/*
1390 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1391 * are resumed.
1392 */
1393void
1394cpususpend_handler(void)
1395{
1396	cpumask_t cpumask;
1397	register_t cr3, rf;
1398	u_int cpu;
1399
1400	cpu = PCPU_GET(cpuid);
1401	cpumask = PCPU_GET(cpumask);
1402
1403	rf = intr_disable();
1404	cr3 = rcr3();
1405
1406	if (savectx(susppcbs[cpu])) {
1407		wbinvd();
1408		atomic_set_int(&stopped_cpus, cpumask);
1409	} else {
1410		PCPU_SET(switchtime, 0);
1411		PCPU_SET(switchticks, ticks);
1412	}
1413
1414	/* Wait for resume */
1415	while (!(started_cpus & cpumask))
1416		ia32_pause();
1417
1418	atomic_clear_int(&started_cpus, cpumask);
1419	atomic_clear_int(&stopped_cpus, cpumask);
1420
1421	/* Restore CR3 and enable interrupts */
1422	load_cr3(cr3);
1423	mca_resume();
1424	lapic_setup(0);
1425	intr_restore(rf);
1426}
1427
1428/*
1429 * This is called once the rest of the system is up and running and we're
1430 * ready to let the APs out of the pen.
1431 */
1432static void
1433release_aps(void *dummy __unused)
1434{
1435
1436	if (mp_ncpus == 1)
1437		return;
1438	atomic_store_rel_int(&aps_ready, 1);
1439	while (smp_started == 0)
1440		ia32_pause();
1441}
1442SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1443
1444static int
1445sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1446{
1447	cpumask_t mask;
1448	int error;
1449
1450	mask = hlt_cpus_mask;
1451	error = sysctl_handle_int(oidp, &mask, 0, req);
1452	if (error || !req->newptr)
1453		return (error);
1454
1455	if (logical_cpus_mask != 0 &&
1456	    (mask & logical_cpus_mask) == logical_cpus_mask)
1457		hlt_logical_cpus = 1;
1458	else
1459		hlt_logical_cpus = 0;
1460
1461	if (! hyperthreading_allowed)
1462		mask |= hyperthreading_cpus_mask;
1463
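	/* Never halt every CPU; keep the BSP (CPU 0) running. */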
1464	if ((mask & all_cpus) == all_cpus)
1465		mask &= ~(1<<0);
1466	hlt_cpus_mask = mask;
1467	return (error);
1468}
1469SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1470    0, 0, sysctl_hlt_cpus, "IU",
1471    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
1472
1473static int
1474sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1475{
1476	int disable, error;
1477
1478	disable = hlt_logical_cpus;
1479	error = sysctl_handle_int(oidp, &disable, 0, req);
1480	if (error || !req->newptr)
1481		return (error);
1482
1483	if (disable)
1484		hlt_cpus_mask |= logical_cpus_mask;
1485	else
1486		hlt_cpus_mask &= ~logical_cpus_mask;
1487
1488	if (! hyperthreading_allowed)
1489		hlt_cpus_mask |= hyperthreading_cpus_mask;
1490
1491	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1492		hlt_cpus_mask &= ~(1<<0);
1493
1494	hlt_logical_cpus = disable;
1495	return (error);
1496}
1497
1498static int
1499sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1500{
1501	int allowed, error;
1502
1503	allowed = hyperthreading_allowed;
1504	error = sysctl_handle_int(oidp, &allowed, 0, req);
1505	if (error || !req->newptr)
1506		return (error);
1507
1508#ifdef SCHED_ULE
1509	/*
1510	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
1511	 * run-time.
1512	 */
1513	if (allowed != hyperthreading_allowed)
1514		return (ENOTSUP);
1515	return (error);
1516#endif
1517
1518	if (allowed)
1519		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1520	else
1521		hlt_cpus_mask |= hyperthreading_cpus_mask;
1522
1523	if (logical_cpus_mask != 0 &&
1524	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1525		hlt_logical_cpus = 1;
1526	else
1527		hlt_logical_cpus = 0;
1528
1529	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1530		hlt_cpus_mask &= ~(1<<0);
1531
1532	hyperthreading_allowed = allowed;
1533	return (error);
1534}
1535
1536static void
1537cpu_hlt_setup(void *dummy __unused)
1538{
1539
1540	if (logical_cpus_mask != 0) {
1541		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
1542		    &hlt_logical_cpus);
1543		sysctl_ctx_init(&logical_cpu_clist);
1544		SYSCTL_ADD_PROC(&logical_cpu_clist,
1545		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1546		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
1547		    sysctl_hlt_logical_cpus, "IU", "");
1548		SYSCTL_ADD_UINT(&logical_cpu_clist,
1549		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1550		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
1551		    &logical_cpus_mask, 0, "");
1552
1553		if (hlt_logical_cpus)
1554			hlt_cpus_mask |= logical_cpus_mask;
1555
1556		/*
1557		 * If necessary for security purposes, force
1558		 * hyperthreading off, regardless of the value
1559		 * of hlt_logical_cpus.
1560		 */
1561		if (hyperthreading_cpus_mask) {
1562			SYSCTL_ADD_PROC(&logical_cpu_clist,
1563			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1564			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
1565			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
1566			if (! hyperthreading_allowed)
1567				hlt_cpus_mask |= hyperthreading_cpus_mask;
1568		}
1569	}
1570}
1571SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1572
1573int
1574mp_grab_cpu_hlt(void)
1575{
1576	cpumask_t mask;
1577#ifdef MP_WATCHDOG
1578	u_int cpuid;
1579#endif
1580	int retval;
1581
1582	mask = PCPU_GET(cpumask);
1583#ifdef MP_WATCHDOG
1584	cpuid = PCPU_GET(cpuid);
1585	ap_watchdog(cpuid);
1586#endif
1587
1588	retval = 0;
1589	while (mask & hlt_cpus_mask) {
1590		retval = 1;
1591		__asm __volatile("sti; hlt" : : : "memory");
1592	}
1593	return (retval);
1594}
1595
1596#ifdef COUNT_IPIS
1597/*
1598 * Setup interrupt counters for IPI handlers.
1599 */
1600static void
1601mp_ipi_intrcnt(void *dummy)
1602{
1603	char buf[64];
1604	int i;
1605
1606	CPU_FOREACH(i) {
1607		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1608		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1609		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1610		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1611		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1612		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1613		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1614		intrcnt_add(buf, &ipi_preempt_counts[i]);
1615		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1616		intrcnt_add(buf, &ipi_ast_counts[i]);
1617		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1618		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1619		snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i);
1620		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
1621		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1622		intrcnt_add(buf, &ipi_hardclock_counts[i]);
1623	}
1624}
1625SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1626#endif
1627
1628