mp_machdep.c revision 194784
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 194784 2009-06-23 22:42:39Z jeff $");
29
30#include "opt_cpu.h"
31#include "opt_kstack_pages.h"
32#include "opt_mp_watchdog.h"
33#include "opt_sched.h"
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/bus.h>
38#ifdef GPROF
39#include <sys/gmon.h>
40#endif
41#include <sys/kernel.h>
42#include <sys/ktr.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/memrange.h>
46#include <sys/mutex.h>
47#include <sys/pcpu.h>
48#include <sys/proc.h>
49#include <sys/sched.h>
50#include <sys/smp.h>
51#include <sys/sysctl.h>
52
53#include <vm/vm.h>
54#include <vm/vm_param.h>
55#include <vm/pmap.h>
56#include <vm/vm_kern.h>
57#include <vm/vm_extern.h>
58
59#include <machine/apicreg.h>
60#include <machine/clock.h>
61#include <machine/cputypes.h>
62#include <machine/cpufunc.h>
63#include <machine/mca.h>
64#include <machine/md_var.h>
65#include <machine/mp_watchdog.h>
66#include <machine/pcb.h>
67#include <machine/psl.h>
68#include <machine/smp.h>
69#include <machine/specialreg.h>
70#include <machine/tss.h>
71
72#define WARMBOOT_TARGET		0
73#define WARMBOOT_OFF		(KERNBASE + 0x0467)
74#define WARMBOOT_SEG		(KERNBASE + 0x0469)
75
76#define CMOS_REG		(0x70)
77#define CMOS_DATA		(0x71)
78#define BIOS_RESET		(0x0f)
79#define BIOS_WARM		(0x0a)
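/*
 * A note on the warm-boot defines above: offset 0x467 in the BIOS data
 * area holds the segment:offset that the BIOS jumps to after a warm
 * reset, and CMOS register 0x0f is the shutdown status byte (0x0a asks
 * for a "jump via 40:67 without EOI" resume).  start_all_aps() points
 * this vector at the AP trampoline; it is the historical path for
 * processors that come out of INIT through the BIOS rather than
 * directly through a STARTUP IPI vector.
 */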
80
81/* lock region used by kernel profiling */
82int	mcount_lock;
83
84int	mp_naps;		/* # of Application Processors */
85int	boot_cpu_id = -1;	/* designated BSP */
86
87extern  struct pcpu __pcpu[];
88
89/* AP uses this during bootstrap.  Do not staticize.  */
90char *bootSTK;
91static int bootAP;
92
93/* Free these after use */
94void *bootstacks[MAXCPU];
95
96/* Temporary variables for init_secondary()  */
97char *doublefault_stack;
98char *nmi_stack;
99void *dpcpu;
100
101/* Hotwire a 0->4MB V==P mapping */
102extern pt_entry_t *KPTphys;
103
104/* SMP page table page */
105extern pt_entry_t *SMPpt;
106
107struct pcb stoppcbs[MAXCPU];
108struct xpcb *stopxpcbs = NULL;
109
110/* Variables needed for SMP tlb shootdown. */
111vm_offset_t smp_tlb_addr1;
112vm_offset_t smp_tlb_addr2;
113volatile int smp_tlb_wait;
114
115extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
116
117#ifdef STOP_NMI
118static volatile cpumask_t ipi_nmi_pending;
119
120static void	ipi_nmi_selected(cpumask_t cpus);
121#endif
122
123/*
124 * Local data and functions.
125 */
126
127#ifdef STOP_NMI
128/*
129 * Provide an alternate method of stopping other CPUs. If another CPU has
130 * disabled interrupts the conventional STOP IPI will be blocked. This
131 * NMI-based stop should get through in that case.
132 */
133static int stop_cpus_with_nmi = 1;
134SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
135    &stop_cpus_with_nmi, 0, "");
136TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
137#else
138#define	stop_cpus_with_nmi	0
139#endif
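/*
 * NMIs are delivered regardless of the interrupt flag, so the NMI stop
 * path above also reaches CPUs that are spinning with interrupts
 * disabled.  The knob can be flipped at runtime through the sysctl or
 * preset from the loader, e.g. in /boot/loader.conf (illustrative):
 *
 *	debug.stop_cpus_with_nmi="0"
 */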
140
141static u_int logical_cpus;
142
143/* used to hold the APs until we are ready to release them */
144static struct mtx ap_boot_mtx;
145
146/* Set to 1 once we're ready to let the APs out of the pen. */
147static volatile int aps_ready = 0;
148
149/*
150 * Store data from cpu_add() until later in the boot when we actually setup
151 * the APs.
152 */
153struct cpu_info {
154	int	cpu_present:1;
155	int	cpu_bsp:1;
156	int	cpu_disabled:1;
157	int	cpu_hyperthread:1;
158} static cpu_info[MAX_APIC_ID + 1];
159int cpu_apic_ids[MAXCPU];
160int apic_cpuids[MAX_APIC_ID + 1];
161
162/* Holds pending bitmap based IPIs per CPU */
163static volatile u_int cpu_ipi_pending[MAXCPU];
164
165static u_int boot_address;
166static int cpu_logical;
167static int cpu_cores;
168
169static void	assign_cpu_ids(void);
170static void	set_interrupt_apic_ids(void);
171static int	start_all_aps(void);
172static int	start_ap(int apic_id);
173static void	release_aps(void *dummy);
174
175static int	hlt_logical_cpus;
176static u_int	hyperthreading_cpus;
177static cpumask_t	hyperthreading_cpus_mask;
178static int	hyperthreading_allowed = 1;
179static struct	sysctl_ctx_list logical_cpu_clist;
180static u_int	bootMP_size;
181
182static void
183mem_range_AP_init(void)
184{
185	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
186		mem_range_softc.mr_op->initAP(&mem_range_softc);
187}
188
189static void
190topo_probe_0xb(void)
191{
192	int logical;
193	int p[4];
194	int bits;
195	int type;
196	int cnt;
197	int i;
198	int x;
199
200	/* We only support two levels for now. */
201	for (i = 0; i < 3; i++) {
202		cpuid_count(0x0B, i, p);
203		bits = p[0] & 0x1f;
204		logical = p[1] &= 0xffff;
205		type = (p[2] >> 8) & 0xff;
206		if (type == 0 || logical == 0)
207			break;
208		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
209			if (!cpu_info[x].cpu_present ||
210			    cpu_info[x].cpu_disabled)
211				continue;
212			if (x >> bits == boot_cpu_id >> bits)
213				cnt++;
214		}
215		if (type == CPUID_TYPE_SMT)
216			cpu_logical = cnt;
217		else if (type == CPUID_TYPE_CORE)
218			cpu_cores = cnt;
219	}
220	if (cpu_logical == 0)
221		cpu_logical = 1;
222	cpu_cores /= cpu_logical;
223}
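/*
 * topo_probe_0xb() relies on CPUID leaf 0x0b ("extended topology"): for
 * each sub-leaf, EAX[4:0] is the number of APIC ID bits consumed below
 * that level, EBX[15:0] the number of logical processors at the level,
 * and ECX[15:8] the level type (1 = SMT, 2 = core).  Counting the
 * present APIC IDs that match the boot CPU's ID above the reported
 * shift gives the sizes actually populated.  Hypothetical example: with
 * a shift of 1 at the SMT level and 3 at the core level on a fully
 * populated 4-core/2-thread package, cpu_logical becomes 2 and
 * cpu_cores, after the final division, 4.
 */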
224
225static void
226topo_probe_0x4(void)
227{
228	u_int threads_per_cache, p[4];
229	u_int htt, cmp;
230	int i;
231
232	htt = cmp = 1;
233	/*
234	 * If this CPU supports HTT or CMP then mention the
235	 * number of physical/logical cores it contains.
236	 */
237	if (cpu_feature & CPUID_HTT)
238		htt = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
239	if (cpu_vendor_id == CPU_VENDOR_AMD && (amd_feature2 & AMDID2_CMP))
240		cmp = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
241	else if (cpu_vendor_id == CPU_VENDOR_INTEL && (cpu_high >= 4)) {
242		cpuid_count(4, 0, p);
243		if ((p[0] & 0x1f) != 0)
244			cmp = ((p[0] >> 26) & 0x3f) + 1;
245	}
246	cpu_cores = cmp;
247	cpu_logical = htt / cmp;
248
249	/* Set up the initial logical CPU info. */
250	if (cpu_feature & CPUID_HTT)
251		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
252
253	/*
254	 * Work out if hyperthreading is *really* enabled.  This
255	 * is made really ugly by the fact that processors lie: Dual
256	 * core processors claim to be hyperthreaded even when they're
257	 * not, presumably because they want to be treated the same
258	 * way as HTT with respect to per-cpu software licensing.
259	 * At the time of writing (May 12, 2005) the only hyperthreaded
260	 * cpus are from Intel, and Intel's dual-core processors can be
261	 * identified via the "deterministic cache parameters" cpuid
262	 * calls.
263	 */
264	/*
265	 * First determine if this is an Intel processor which claims
266	 * to have hyperthreading support.
267	 */
268	if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) {
269		/*
270		 * If the "deterministic cache parameters" cpuid calls
271		 * are available, use them.
272		 */
273		if (cpu_high >= 4) {
274			/* Ask the processor about the L1 cache. */
275			for (i = 0; i < 1; i++) {
276				cpuid_count(4, i, p);
277				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
278				if (hyperthreading_cpus < threads_per_cache)
279					hyperthreading_cpus = threads_per_cache;
280				if ((p[0] & 0x1f) == 0)
281					break;
282			}
283		}
284
285		/*
286		 * If the deterministic cache parameters are not
287		 * available, or if no caches were reported to exist,
288		 * just accept what the HTT flag indicated.
289		 */
290		if (hyperthreading_cpus == 0)
291			hyperthreading_cpus = logical_cpus;
292	}
293}
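/*
 * topo_probe_0x4() uses the older interfaces: CPUID.1 EBX[23:16]
 * (cpu_procinfo & CPUID_HTT_CORES above) gives the logical processor
 * count per package when the HTT flag is set, CPUID.4 EAX[31:26] gives
 * Intel cores per package minus one, and the AMD core count comes from
 * the AMDID_CMP_CORES field.  Hypothetical example: a dual-core Intel
 * part without HyperThreading reports 2 logical processors and 2 cores,
 * so cpu_cores = 2 and cpu_logical = 2 / 2 = 1.
 */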
294
295static void
296topo_probe(void)
297{
298	static int cpu_topo_probed = 0;
299
300	if (cpu_topo_probed)
301		return;
302
303	logical_cpus = logical_cpus_mask = 0;
304	if (cpu_high >= 0xb)
305		topo_probe_0xb();
306	else if (cpu_high)
307		topo_probe_0x4();
308	if (cpu_cores == 0)
309		cpu_cores = mp_ncpus > 0 ? mp_ncpus : 1;
310	if (cpu_logical == 0)
311		cpu_logical = 1;
312	cpu_topo_probed = 1;
313}
314
315struct cpu_group *
316cpu_topo(void)
317{
318	int cg_flags;
319
320	/*
321	 * Determine whether any threading flags are
322	 * necessary.
323	 */
324	topo_probe();
325	if (cpu_logical > 1 && hyperthreading_cpus)
326		cg_flags = CG_FLAG_HTT;
327	else if (cpu_logical > 1)
328		cg_flags = CG_FLAG_SMT;
329	else
330		cg_flags = 0;
331	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
332		printf("WARNING: Non-uniform processors.\n");
333		printf("WARNING: Using suboptimal topology.\n");
334		return (smp_topo_none());
335	}
336	/*
337	 * Neither multi-core nor hyper-threaded.
338	 */
339	if (cpu_logical * cpu_cores == 1)
340		return (smp_topo_none());
341	/*
342	 * Only HTT, no multi-core.
343	 */
344	if (cpu_logical > 1 && cpu_cores == 1)
345		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
346	/*
347	 * Only multi-core, no HTT.
348	 */
349	if (cpu_cores > 1 && cpu_logical == 1)
350		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
351	/*
352	 * Both HTT and multi-core.
353	 */
354	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
355	    CG_SHARE_L1, cpu_logical, cg_flags));
356}
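/*
 * Hypothetical example of the mapping above: one package with four
 * cores and two SMT threads per core (cpu_cores = 4, cpu_logical = 2)
 * yields smp_topo_2level(CG_SHARE_L2, 4, CG_SHARE_L1, 2, cg_flags),
 * i.e. an L2-sharing group of cores, each containing an L1-sharing
 * pair of threads.
 */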
357
358/*
359 * Calculate usable address in base memory for AP trampoline code.
360 */
361u_int
362mp_bootaddress(u_int basemem)
363{
364
365	bootMP_size = mptramp_end - mptramp_start;
366	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
367	if (((basemem * 1024) - boot_address) < bootMP_size)
368		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
369	/* 3 levels of page table pages */
370	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);
371
372	return mptramp_pagetables;
373}
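/*
 * Worked example with a hypothetical basemem of 639KB (0x9fc00 bytes):
 * boot_address = trunc_page(0x9fc00) = 0x9f000, leaving 0xc00 bytes for
 * the trampoline (enough provided bootMP_size fits), and the three page
 * table pages occupy 0x9c000-0x9efff, so mptramp_pagetables and the
 * return value are 0x9c000.
 */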
374
375void
376cpu_add(u_int apic_id, char boot_cpu)
377{
378
379	if (apic_id > MAX_APIC_ID) {
380		panic("SMP: APIC ID %d too high", apic_id);
381		return;
382	}
383	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
384	    apic_id));
385	cpu_info[apic_id].cpu_present = 1;
386	if (boot_cpu) {
387		KASSERT(boot_cpu_id == -1,
388		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
389		    boot_cpu_id));
390		boot_cpu_id = apic_id;
391		cpu_info[apic_id].cpu_bsp = 1;
392	}
393	if (mp_ncpus < MAXCPU) {
394		mp_ncpus++;
395		mp_maxid = mp_ncpus - 1;
396	}
397	if (bootverbose)
398		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
399		    "AP");
400}
401
402void
403cpu_mp_setmaxid(void)
404{
405
406	/*
407	 * mp_maxid should be already set by calls to cpu_add().
408	 * Just sanity check its value here.
409	 */
410	if (mp_ncpus == 0)
411		KASSERT(mp_maxid == 0,
412		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
413	else if (mp_ncpus == 1)
414		mp_maxid = 0;
415	else
416		KASSERT(mp_maxid >= mp_ncpus - 1,
417		    ("%s: counters out of sync: max %d, count %d", __func__,
418			mp_maxid, mp_ncpus));
419}
420
421int
422cpu_mp_probe(void)
423{
424
425	/*
426	 * Always record BSP in CPU map so that the mbuf init code works
427	 * correctly.
428	 */
429	all_cpus = 1;
430	if (mp_ncpus == 0) {
431		/*
432		 * No CPUs were found, so this must be a UP system.  Set up
433		 * the variables to represent a system with a single CPU
434		 * with an ID of 0.
435		 */
436		mp_ncpus = 1;
437		return (0);
438	}
439
440	/* At least one CPU was found. */
441	if (mp_ncpus == 1) {
442		/*
443		 * One CPU was found, so this must be a UP system with
444		 * an I/O APIC.
445		 */
446		mp_maxid = 0;
447		return (0);
448	}
449
450	/* At least two CPUs were found. */
451	return (1);
452}
453
454/*
455 * Initialize the IPI handlers and start up the APs.
456 */
457void
458cpu_mp_start(void)
459{
460	int i;
461
462	/* Initialize the logical ID to APIC ID table. */
463	for (i = 0; i < MAXCPU; i++) {
464		cpu_apic_ids[i] = -1;
465		cpu_ipi_pending[i] = 0;
466	}
467
468	/* Install an inter-CPU IPI for TLB invalidation */
469	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
470	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
471	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
472
473	/* Install an inter-CPU IPI for cache invalidation. */
474	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
475
476	/* Install an inter-CPU IPI for all-CPU rendezvous */
477	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
478
479	/* Install generic inter-CPU IPI handler */
480	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
481	       SDT_SYSIGT, SEL_KPL, 0);
482
483	/* Install an inter-CPU IPI for CPU stop/restart */
484	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
485
486	/* Install an inter-CPU IPI for CPU suspend/resume */
487	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
488
489	/* Set boot_cpu_id if needed. */
490	if (boot_cpu_id == -1) {
491		boot_cpu_id = PCPU_GET(apic_id);
492		cpu_info[boot_cpu_id].cpu_bsp = 1;
493	} else
494		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
495		    ("BSP's APIC ID doesn't match boot_cpu_id"));
496
497	/* Probe logical/physical core configuration. */
498	topo_probe();
499
500	assign_cpu_ids();
501
502	/* Start each Application Processor */
503	start_all_aps();
504
505	set_interrupt_apic_ids();
506}
507
508
509/*
510 * Print various information about the SMP system hardware and setup.
511 */
512void
513cpu_mp_announce(void)
514{
515	const char *hyperthread;
516	int i;
517
518	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
519	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
520	if (hyperthreading_cpus > 1)
521	    printf(" x %d HTT threads", cpu_logical);
522	else if (cpu_logical > 1)
523	    printf(" x %d SMT threads", cpu_logical);
524	printf("\n");
525
526	/* List active CPUs first. */
527	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
528	for (i = 1; i < mp_ncpus; i++) {
529		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
530			hyperthread = "/HT";
531		else
532			hyperthread = "";
533		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
534		    cpu_apic_ids[i]);
535	}
536
537	/* List disabled CPUs last. */
538	for (i = 0; i <= MAX_APIC_ID; i++) {
539		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
540			continue;
541		if (cpu_info[i].cpu_hyperthread)
542			hyperthread = "/HT";
543		else
544			hyperthread = "";
545		printf("  cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
546		    i);
547	}
548}
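/*
 * Sample (hypothetical) boot output from the code above, for two
 * packages of two HTT cores each:
 *
 *	FreeBSD/SMP: 2 package(s) x 2 core(s) x 2 HTT threads
 *	 cpu0 (BSP): APIC ID:  0
 *	 cpu1 (AP/HT): APIC ID:  1
 *	 cpu2 (AP): APIC ID:  2
 *	...
 */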
549
550/*
551 * AP CPUs call this to initialize themselves.
552 */
553void
554init_secondary(void)
555{
556	struct pcpu *pc;
557	struct nmi_pcpu *np;
558	u_int64_t msr, cr0;
559	int cpu, gsel_tss, x;
560	struct region_descriptor ap_gdt;
561
562	/* Set by the startup code for us to use */
563	cpu = bootAP;
564
565	/* Init tss */
566	common_tss[cpu] = common_tss[0];
567	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
568	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
569	    IOPAGES * PAGE_SIZE;
570	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
571
572	/* The NMI stack runs on IST2. */
573	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
574	common_tss[cpu].tss_ist2 = (long) np;
575
576	/* Prepare private GDT */
577	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
578	for (x = 0; x < NGDT; x++) {
579		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
580		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
581			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
582	}
583	ssdtosyssd(&gdt_segs[GPROC0_SEL],
584	    (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
585	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
586	ap_gdt.rd_base =  (long) &gdt[NGDT * cpu];
587	lgdt(&ap_gdt);			/* does magic intra-segment return */
588
589	/* Get per-cpu data */
590	pc = &__pcpu[cpu];
591
592	/* prime data page for it to use */
593	pcpu_init(pc, cpu, sizeof(struct pcpu));
594	dpcpu_init(dpcpu, cpu);
595	pc->pc_apic_id = cpu_apic_ids[cpu];
596	pc->pc_prvspace = pc;
597	pc->pc_curthread = 0;
598	pc->pc_tssp = &common_tss[cpu];
599	pc->pc_commontssp = &common_tss[cpu];
600	pc->pc_rsp0 = 0;
601	pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
602	    GPROC0_SEL];
603	pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL];
604	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
605	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu +
606	    GUSERLDT_SEL];
607
608	/* Save the per-cpu pointer for use by the NMI handler. */
609	np->np_pcpu = (register_t) pc;
610
611	wrmsr(MSR_FSBASE, 0);		/* User value */
612	wrmsr(MSR_GSBASE, (u_int64_t)pc);
613	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
614
615	lidt(&r_idt);
616
617	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
618	ltr(gsel_tss);
619
620	/*
621	 * Set to a known state:
622	 * Set by mpboot.s: CR0_PG, CR0_PE
623	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
624	 */
625	cr0 = rcr0();
626	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
627	load_cr0(cr0);
628
629	/* Set up the fast syscall stuff */
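	/*
	 * EFER.SCE enables SYSCALL/SYSRET.  MSR_LSTAR holds the 64-bit
	 * syscall entry point and MSR_CSTAR the entry point used by
	 * 32-bit (compatibility mode) callers.  In MSR_STAR, bits 47:32
	 * give the kernel CS/SS selector base loaded on SYSCALL and bits
	 * 63:48 the base from which the user selectors are derived on
	 * SYSRET.  The bits set in MSR_SF_MASK are cleared from RFLAGS
	 * on SYSCALL entry, so the handler starts with interrupts and
	 * traps disabled.
	 */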
630	msr = rdmsr(MSR_EFER) | EFER_SCE;
631	wrmsr(MSR_EFER, msr);
632	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
633	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
634	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
635	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
636	wrmsr(MSR_STAR, msr);
637	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
638
639	/* Disable local APIC just to be sure. */
640	lapic_disable();
641
642	/* signal our startup to the BSP. */
643	mp_naps++;
644
645	/* Spin until the BSP releases the AP's. */
646	while (!aps_ready)
647		ia32_pause();
648
649	/* Initialize the PAT MSR. */
650	pmap_init_pat();
651
652	/* set up CPU registers and state */
653	cpu_setregs();
654
655	/* set up SSE/NX registers */
656	initializecpu();
657
658	/* set up FPU state on the AP */
659	fpuinit();
660
661	/* A quick check from sanity claus */
662	if (PCPU_GET(apic_id) != lapic_id()) {
663		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
664		printf("SMP: actual apic_id = %d\n", lapic_id());
665		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
666		panic("cpuid mismatch! boom!!");
667	}
668
669	/* Initialize curthread. */
670	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
671	PCPU_SET(curthread, PCPU_GET(idlethread));
672
673	mca_init();
674
675	mtx_lock_spin(&ap_boot_mtx);
676
677	/* Init local apic for irq's */
678	lapic_setup(1);
679
680	/* Set memory range attributes for this CPU to match the BSP */
681	mem_range_AP_init();
682
683	smp_cpus++;
684
685	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
686	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
687
688	/* Determine if we are a logical CPU. */
689	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
690		logical_cpus_mask |= PCPU_GET(cpumask);
691
692	/* Determine if we are a hyperthread. */
693	if (hyperthreading_cpus > 1 &&
694	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
695		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
696
697	/* Build our map of 'other' CPUs. */
698	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
699
700	if (bootverbose)
701		lapic_dump("AP");
702
703	if (smp_cpus == mp_ncpus) {
704		/* enable IPIs, TLB shootdown, freezes, etc. */
705		atomic_store_rel_int(&smp_started, 1);
706		smp_active = 1;	 /* historic */
707	}
708
709	/*
710	 * Enable global pages TLB extension
711	 * This also implicitly flushes the TLB
712	 */
713
714	load_cr4(rcr4() | CR4_PGE);
715	load_ds(_udatasel);
716	load_es(_udatasel);
717	load_fs(_ufssel);
718	mtx_unlock_spin(&ap_boot_mtx);
719
720	/* wait until all the APs are up */
721	while (smp_started == 0)
722		ia32_pause();
723
724	sched_throw(NULL);
725
726	panic("scheduler returned us to %s", __func__);
727	/* NOTREACHED */
728}
729
730/*******************************************************************
731 * local functions and data
732 */
733
734/*
735 * We tell the I/O APIC code about all the CPUs that we want to receive
736 * interrupts.  If we don't want certain CPUs to receive IRQs we
737 * can simply not tell the I/O APIC code about them in this function.
738 * We do not tell it about the BSP either, since the I/O APIC code
739 * registers the BSP itself so that UP kernels and UP machines work.
740 */
741static void
742set_interrupt_apic_ids(void)
743{
744	u_int i, apic_id;
745
746	for (i = 0; i < MAXCPU; i++) {
747		apic_id = cpu_apic_ids[i];
748		if (apic_id == -1)
749			continue;
750		if (cpu_info[apic_id].cpu_bsp)
751			continue;
752		if (cpu_info[apic_id].cpu_disabled)
753			continue;
754
755		/* Don't let hyperthreads service interrupts. */
756		if (hyperthreading_cpus > 1 &&
757		    apic_id % hyperthreading_cpus != 0)
758			continue;
759
760		intr_add_cpu(i);
761	}
762}
763
764/*
765 * Assign logical CPU IDs to local APICs.
766 */
767static void
768assign_cpu_ids(void)
769{
770	u_int i;
771
772	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
773	    &hyperthreading_allowed);
774
775	/* Check for explicitly disabled CPUs. */
776	for (i = 0; i <= MAX_APIC_ID; i++) {
777		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
778			continue;
779
780		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
781			cpu_info[i].cpu_hyperthread = 1;
782#if defined(SCHED_ULE)
783			/*
784			 * Don't use HT CPU if it has been disabled by a
785			 * tunable.
786			 */
787			if (hyperthreading_allowed == 0) {
788				cpu_info[i].cpu_disabled = 1;
789				continue;
790			}
791#endif
792		}
793
794		/* Don't use this CPU if it has been disabled by a tunable. */
795		if (resource_disabled("lapic", i)) {
796			cpu_info[i].cpu_disabled = 1;
797			continue;
798		}
799	}
800
801	/*
802	 * Assign CPU IDs to local APIC IDs and disable any CPUs
803	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
804	 *
805	 * To minimize confusion for userland, we attempt to number
806	 * CPUs such that all threads and cores in a package are
807	 * grouped together.  For now we assume that the BSP is always
808	 * the first thread in a package and just start adding APs
809	 * starting with the BSP's APIC ID.
810	 */
811	mp_ncpus = 1;
812	cpu_apic_ids[0] = boot_cpu_id;
813	apic_cpuids[boot_cpu_id] = 0;
814	for (i = boot_cpu_id + 1; i != boot_cpu_id;
815	     i == MAX_APIC_ID ? i = 0 : i++) {
816		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
817		    cpu_info[i].cpu_disabled)
818			continue;
819
820		if (mp_ncpus < MAXCPU) {
821			cpu_apic_ids[mp_ncpus] = i;
822			apic_cpuids[i] = mp_ncpus;
823			mp_ncpus++;
824		} else
825			cpu_info[i].cpu_disabled = 1;
826	}
827	KASSERT(mp_maxid >= mp_ncpus - 1,
828	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
829	    mp_ncpus));
830}
831
832/*
833 * start each AP in our list
834 */
835static int
836start_all_aps(void)
837{
838	vm_offset_t va = boot_address + KERNBASE;
839	u_int64_t *pt4, *pt3, *pt2;
840	u_int32_t mpbioswarmvec;
841	int apic_id, cpu, i;
842	u_char mpbiosreason;
843
844	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
845
846	/* install the AP 1st level boot code */
847	pmap_kenter(va, boot_address);
848	pmap_invalidate_page(kernel_pmap, va);
849	bcopy(mptramp_start, (void *)va, bootMP_size);
850
851	/* Locate the page tables, they'll be below the trampoline */
852	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
853	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
854	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
855
856	/* Create the initial 1GB replicated page tables */
857	for (i = 0; i < 512; i++) {
858		/* Each slot of the level 4 pages points to the same level 3 page */
859		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
860		pt4[i] |= PG_V | PG_RW | PG_U;
861
862		/* Each slot of the level 3 pages points to the same level 2 page */
863		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
864		pt3[i] |= PG_V | PG_RW | PG_U;
865
866		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
867		pt2[i] = i * (2 * 1024 * 1024);
868		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
869	}
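	/*
	 * The result is a page table in which every 1GB slot of the
	 * virtual address space maps the same low 1GB of physical
	 * memory with 2MB pages, so the trampoline can enable paging at
	 * its low identity-mapped address and still reach the kernel at
	 * KERNBASE with the same %cr3.
	 */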
870
871	/* save the current value of the warm-start vector */
872	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
873	outb(CMOS_REG, BIOS_RESET);
874	mpbiosreason = inb(CMOS_DATA);
875
876	/* set up a vector to our boot code */
877	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
878	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
879	outb(CMOS_REG, BIOS_RESET);
880	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
881
882	/* start each AP */
883	for (cpu = 1; cpu < mp_ncpus; cpu++) {
884		apic_id = cpu_apic_ids[cpu];
885
886		/* allocate and set up an idle stack data page */
887		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
888		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
889		nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
890		dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE);
891
892		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
893		bootAP = cpu;
894
895		/* attempt to start the Application Processor */
896		if (!start_ap(apic_id)) {
897			/* restore the warmstart vector */
898			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
899			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
900		}
901
902		all_cpus |= (1 << cpu);		/* record AP in CPU map */
903	}
904
905	/* build our map of 'other' CPUs */
906	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
907
908	/* restore the warmstart vector */
909	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
910
911	outb(CMOS_REG, BIOS_RESET);
912	outb(CMOS_DATA, mpbiosreason);
913
914	/* number of APs actually started */
915	return mp_naps;
916}
917
918
919/*
920 * This function starts the AP (Application Processor) identified
921 * by the APIC ID 'apic_id'.  It does quite a "song and dance"
922 * to accomplish this.  This is necessary because of the nuances
923 * of the different hardware we might encounter.  It isn't pretty,
924 * but it seems to work.
925 */
926static int
927start_ap(int apic_id)
928{
929	int vector, ms;
930	int cpus;
931
932	/* calculate the vector */
933	vector = (boot_address >> 12) & 0xff;
934
935	/* used as a watchpoint to signal AP startup */
936	cpus = mp_naps;
937
938	/*
939	 * First we do an INIT/RESET IPI: this INIT IPI might be run, resetting
940	 * and running the target CPU; OR this INIT IPI might be latched (P5
941	 * bug), leaving the CPU waiting for a STARTUP IPI; OR this INIT IPI
942	 * might be ignored.
943	 */
944
945	/* do an INIT IPI: assert RESET */
946	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
947	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
948
949	/* wait for pending status end */
950	lapic_ipi_wait(-1);
951
952	/* do an INIT IPI: deassert RESET */
953	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
954	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
955
956	/* wait for pending status end */
957	DELAY(10000);		/* wait ~10 ms */
958	lapic_ipi_wait(-1);
959
960	/*
961	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
962	 * latched (P5 bug), in which case this first STARTUP IPI terminates
963	 * immediately and the previously started INIT IPI continues; OR
964	 * the previous INIT IPI has already run, and this STARTUP IPI will
965	 * run; OR the previous INIT IPI was ignored, and this STARTUP IPI
966	 * will run.
967	 */
968
969	/* do a STARTUP IPI */
970	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
971	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
972	    vector, apic_id);
973	lapic_ipi_wait(-1);
974	DELAY(200);		/* wait ~200 us */
975
976	/*
977	 * Finally we do a second STARTUP IPI: this second STARTUP IPI should
978	 * run IF the previous STARTUP IPI was cancelled by a latched INIT IPI;
979	 * otherwise this STARTUP IPI will be ignored, as only ONE STARTUP IPI
980	 * is recognized after a hardware RESET or INIT IPI.
981	 */
982
983	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
984	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
985	    vector, apic_id);
986	lapic_ipi_wait(-1);
987	DELAY(200);		/* wait ~200 us */
988
989	/* Wait up to 5 seconds for it to start. */
990	for (ms = 0; ms < 5000; ms++) {
991		if (mp_naps > cpus)
992			return 1;	/* return SUCCESS */
993		DELAY(1000);
994	}
995	return 0;		/* return FAILURE */
996}
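/*
 * The STARTUP IPI vector is the page number of the trampoline, so an AP
 * begins real-mode execution at physical address vector << 12.  With the
 * hypothetical boot_address of 0x9f000 used earlier, vector is 0x9f and
 * the AP starts fetching code at 0x9f00:0000.
 */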
997
998/*
999 * Flush the TLB on all other CPUs.
1000 */
1001static void
1002smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1003{
1004	u_int ncpu;
1005
1006	ncpu = mp_ncpus - 1;	/* does not shoot down self */
1007	if (ncpu < 1)
1008		return;		/* no other cpus */
1009	if (!(read_rflags() & PSL_I))
1010		panic("%s: interrupts disabled", __func__);
1011	mtx_lock_spin(&smp_ipi_mtx);
1012	smp_tlb_addr1 = addr1;
1013	smp_tlb_addr2 = addr2;
1014	atomic_store_rel_int(&smp_tlb_wait, 0);
1015	ipi_all_but_self(vector);
1016	while (smp_tlb_wait < ncpu)
1017		ia32_pause();
1018	mtx_unlock_spin(&smp_ipi_mtx);
1019}
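/*
 * The shootdown above is a simple rendezvous: the initiator publishes
 * addr1/addr2, zeroes smp_tlb_wait, sends the IPI and spins until the
 * counter reaches the number of other CPUs.  The interrupt handlers for
 * the INVL* vectors (in the low-level IPI code) are expected to perform
 * the invalidation and atomically increment smp_tlb_wait as their
 * acknowledgement.
 */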
1020
1021static void
1022smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1023{
1024	int ncpu, othercpus;
1025
1026	othercpus = mp_ncpus - 1;
1027	if (mask == (u_int)-1) {
1028		ncpu = othercpus;
1029		if (ncpu < 1)
1030			return;
1031	} else {
1032		mask &= ~PCPU_GET(cpumask);
1033		if (mask == 0)
1034			return;
1035		ncpu = bitcount32(mask);
1036		if (ncpu > othercpus) {
1037			/* XXX this should be a panic offence */
1038			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1039			    ncpu, othercpus);
1040			ncpu = othercpus;
1041		}
1042		/* XXX should be a panic, implied by mask == 0 above */
1043		if (ncpu < 1)
1044			return;
1045	}
1046	if (!(read_rflags() & PSL_I))
1047		panic("%s: interrupts disabled", __func__);
1048	mtx_lock_spin(&smp_ipi_mtx);
1049	smp_tlb_addr1 = addr1;
1050	smp_tlb_addr2 = addr2;
1051	atomic_store_rel_int(&smp_tlb_wait, 0);
1052	if (mask == (u_int)-1)
1053		ipi_all_but_self(vector);
1054	else
1055		ipi_selected(mask, vector);
1056	while (smp_tlb_wait < ncpu)
1057		ia32_pause();
1058	mtx_unlock_spin(&smp_ipi_mtx);
1059}
1060
1061void
1062smp_cache_flush(void)
1063{
1064
1065	if (smp_started)
1066		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1067}
1068
1069void
1070smp_invltlb(void)
1071{
1072
1073	if (smp_started) {
1074		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1075	}
1076}
1077
1078void
1079smp_invlpg(vm_offset_t addr)
1080{
1081
1082	if (smp_started)
1083		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1084}
1085
1086void
1087smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1088{
1089
1090	if (smp_started) {
1091		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1092	}
1093}
1094
1095void
1096smp_masked_invltlb(cpumask_t mask)
1097{
1098
1099	if (smp_started) {
1100		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1101	}
1102}
1103
1104void
1105smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
1106{
1107
1108	if (smp_started) {
1109		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1110	}
1111}
1112
1113void
1114smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
1115{
1116
1117	if (smp_started) {
1118		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1119	}
1120}
1121
1122void
1123ipi_bitmap_handler(struct trapframe frame)
1124{
1125	int cpu = PCPU_GET(cpuid);
1126	u_int ipi_bitmap;
1127
1128	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1129
1130	if (ipi_bitmap & (1 << IPI_PREEMPT))
1131		sched_preempt(curthread);
1132
1133	/* Nothing to do for AST */
1134
1135	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
1136		hardclockintr(&frame);
1137
1138	if (ipi_bitmap & (1 << IPI_STATCLOCK))
1139		statclockintr(&frame);
1140
1141	if (ipi_bitmap & (1 << IPI_PROFCLOCK))
1142		profclockintr(&frame);
1143}
1144
1145/*
1146 * Send an IPI to a set of CPUs.
1147 */
1148void
1149ipi_selected(cpumask_t cpus, u_int ipi)
1150{
1151	int cpu;
1152	u_int bitmap = 0;
1153	u_int old_pending;
1154	u_int new_pending;
1155
1156	if (IPI_IS_BITMAPED(ipi)) {
1157		bitmap = 1 << ipi;
1158		ipi = IPI_BITMAP_VECTOR;
1159	}
1160
1161#ifdef STOP_NMI
1162	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
1163		ipi_nmi_selected(cpus);
1164		return;
1165	}
1166#endif
1167	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1168	while ((cpu = ffs(cpus)) != 0) {
1169		cpu--;
1170		cpus &= ~(1 << cpu);
1171
1172		KASSERT(cpu_apic_ids[cpu] != -1,
1173		    ("IPI to non-existent CPU %d", cpu));
1174
1175		if (bitmap) {
1176			do {
1177				old_pending = cpu_ipi_pending[cpu];
1178				new_pending = old_pending | bitmap;
1179			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending));
1180
1181			if (old_pending)
1182				continue;
1183		}
1184
1185		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1186	}
1187
1188}
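/*
 * Bitmapped IPIs (preempt, hardclock, statclock, profclock) share the
 * single IPI_BITMAP_VECTOR: the sender sets the corresponding bit in
 * cpu_ipi_pending[cpu] and only raises the vector when the word was
 * previously empty, and ipi_bitmap_handler() above atomically reads and
 * clears the word and dispatches each pending request.  This coalesces
 * back-to-back low-priority IPIs into one interrupt per target CPU.
 */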
1189
1190/*
1191 * Send an IPI to all CPUs EXCEPT myself.
1192 */
1193void
1194ipi_all_but_self(u_int ipi)
1195{
1196
1197	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
1198		ipi_selected(PCPU_GET(other_cpus), ipi);
1199		return;
1200	}
1201	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1202	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1203}
1204
1205#ifdef STOP_NMI
1206/*
1207 * Send an NMI IPI to selected CPUs.
1208 */
1209
1210#define	BEFORE_SPIN	1000000
1211
1212static void
1213ipi_nmi_selected(cpumask_t cpus)
1214{
1215	int cpu;
1216	register_t icrlo;
1217
1218	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
1219		| APIC_TRIGMOD_EDGE;
1220
1221	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
1222
1223	atomic_set_int(&ipi_nmi_pending, cpus);
1224
1225	while ((cpu = ffs(cpus)) != 0) {
1226		cpu--;
1227		cpus &= ~(1 << cpu);
1228
1229		KASSERT(cpu_apic_ids[cpu] != -1,
1230		    ("IPI NMI to non-existent CPU %d", cpu));
1231
1232		/* Wait for an earlier IPI to finish. */
1233		if (!lapic_ipi_wait(BEFORE_SPIN))
1234			panic("ipi_nmi_selected: previous IPI has not cleared");
1235
1236		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
1237	}
1238}
1239
1240int
1241ipi_nmi_handler(void)
1242{
1243	int cpumask = PCPU_GET(cpumask);
1244
1245	if (!(ipi_nmi_pending & cpumask))
1246		return 1;
1247
1248	atomic_clear_int(&ipi_nmi_pending, cpumask);
1249	cpustop_handler();
1250	return 0;
1251}
1252
1253#endif /* STOP_NMI */
1254
1255/*
1256 * Handle an IPI_STOP by saving our current context and spinning until we
1257 * are resumed.
1258 */
1259void
1260cpustop_handler(void)
1261{
1262	int cpu = PCPU_GET(cpuid);
1263	int cpumask = PCPU_GET(cpumask);
1264
1265	savectx(&stoppcbs[cpu]);
1266
1267	/* Indicate that we are stopped */
1268	atomic_set_int(&stopped_cpus, cpumask);
1269
1270	/* Wait for restart */
1271	while (!(started_cpus & cpumask))
1272	    ia32_pause();
1273
1274	atomic_clear_int(&started_cpus, cpumask);
1275	atomic_clear_int(&stopped_cpus, cpumask);
1276
1277	if (cpu == 0 && cpustop_restartfunc != NULL) {
1278		cpustop_restartfunc();
1279		cpustop_restartfunc = NULL;
1280	}
1281}
1282
1283/*
1284 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1285 * are resumed.
1286 */
1287void
1288cpususpend_handler(void)
1289{
1290	struct savefpu *stopfpu;
1291	register_t cr3, rf;
1292	int cpu = PCPU_GET(cpuid);
1293	int cpumask = PCPU_GET(cpumask);
1294
1295	rf = intr_disable();
1296	cr3 = rcr3();
1297	stopfpu = &stopxpcbs[cpu].xpcb_pcb.pcb_save;
1298	if (savectx2(&stopxpcbs[cpu])) {
1299		fpugetregs(curthread, stopfpu);
1300		wbinvd();
1301		atomic_set_int(&stopped_cpus, cpumask);
1302	} else
1303		fpusetregs(curthread, stopfpu);
1304
1305	/* Wait for resume */
1306	while (!(started_cpus & cpumask))
1307		ia32_pause();
1308
1309	atomic_clear_int(&started_cpus, cpumask);
1310	atomic_clear_int(&stopped_cpus, cpumask);
1311
1312	/* Restore CR3 and enable interrupts */
1313	load_cr3(cr3);
1314	lapic_setup(0);
1315	intr_restore(rf);
1316}
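/*
 * Note that savectx2() is used like setjmp above: judging from the two
 * branches, it returns non-zero on the initial save (the suspend path,
 * which stores the FPU state, flushes caches and marks the CPU stopped)
 * and zero when the saved context is resumed later, at which point the
 * FPU state is restored and CR3 and the local APIC are reloaded.
 */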
1317
1318/*
1319 * This is called once the rest of the system is up and running and we're
1320 * ready to let the APs out of the pen.
1321 */
1322static void
1323release_aps(void *dummy __unused)
1324{
1325
1326	if (mp_ncpus == 1)
1327		return;
1328	atomic_store_rel_int(&aps_ready, 1);
1329	while (smp_started == 0)
1330		ia32_pause();
1331}
1332SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1333
1334static int
1335sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1336{
1337	cpumask_t mask;
1338	int error;
1339
1340	mask = hlt_cpus_mask;
1341	error = sysctl_handle_int(oidp, &mask, 0, req);
1342	if (error || !req->newptr)
1343		return (error);
1344
1345	if (logical_cpus_mask != 0 &&
1346	    (mask & logical_cpus_mask) == logical_cpus_mask)
1347		hlt_logical_cpus = 1;
1348	else
1349		hlt_logical_cpus = 0;
1350
1351	if (! hyperthreading_allowed)
1352		mask |= hyperthreading_cpus_mask;
1353
1354	if ((mask & all_cpus) == all_cpus)
1355		mask &= ~(1<<0);
1356	hlt_cpus_mask = mask;
1357	return (error);
1358}
1359SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1360    0, 0, sysctl_hlt_cpus, "IU",
1361    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
1362
1363static int
1364sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1365{
1366	int disable, error;
1367
1368	disable = hlt_logical_cpus;
1369	error = sysctl_handle_int(oidp, &disable, 0, req);
1370	if (error || !req->newptr)
1371		return (error);
1372
1373	if (disable)
1374		hlt_cpus_mask |= logical_cpus_mask;
1375	else
1376		hlt_cpus_mask &= ~logical_cpus_mask;
1377
1378	if (! hyperthreading_allowed)
1379		hlt_cpus_mask |= hyperthreading_cpus_mask;
1380
1381	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1382		hlt_cpus_mask &= ~(1<<0);
1383
1384	hlt_logical_cpus = disable;
1385	return (error);
1386}
1387
1388static int
1389sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1390{
1391	int allowed, error;
1392
1393	allowed = hyperthreading_allowed;
1394	error = sysctl_handle_int(oidp, &allowed, 0, req);
1395	if (error || !req->newptr)
1396		return (error);
1397
1398#ifdef SCHED_ULE
1399	/*
1400	 * SCHED_ULE doesn't allow enabling/disabling HT cores at
1401	 * run-time.
1402	 */
1403	if (allowed != hyperthreading_allowed)
1404		return (ENOTSUP);
1405	return (error);
1406#endif
1407
1408	if (allowed)
1409		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1410	else
1411		hlt_cpus_mask |= hyperthreading_cpus_mask;
1412
1413	if (logical_cpus_mask != 0 &&
1414	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1415		hlt_logical_cpus = 1;
1416	else
1417		hlt_logical_cpus = 0;
1418
1419	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1420		hlt_cpus_mask &= ~(1<<0);
1421
1422	hyperthreading_allowed = allowed;
1423	return (error);
1424}
1425
1426static void
1427cpu_hlt_setup(void *dummy __unused)
1428{
1429
1430	if (logical_cpus_mask != 0) {
1431		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
1432		    &hlt_logical_cpus);
1433		sysctl_ctx_init(&logical_cpu_clist);
1434		SYSCTL_ADD_PROC(&logical_cpu_clist,
1435		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1436		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
1437		    sysctl_hlt_logical_cpus, "IU", "");
1438		SYSCTL_ADD_UINT(&logical_cpu_clist,
1439		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1440		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
1441		    &logical_cpus_mask, 0, "");
1442
1443		if (hlt_logical_cpus)
1444			hlt_cpus_mask |= logical_cpus_mask;
1445
1446		/*
1447		 * If necessary for security purposes, force
1448		 * hyperthreading off, regardless of the value
1449		 * of hlt_logical_cpus.
1450		 */
1451		if (hyperthreading_cpus_mask) {
1452			SYSCTL_ADD_PROC(&logical_cpu_clist,
1453			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1454			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
1455			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
1456			if (! hyperthreading_allowed)
1457				hlt_cpus_mask |= hyperthreading_cpus_mask;
1458		}
1459	}
1460}
1461SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1462
1463int
1464mp_grab_cpu_hlt(void)
1465{
1466	u_int mask = PCPU_GET(cpumask);
1467#ifdef MP_WATCHDOG
1468	u_int cpuid = PCPU_GET(cpuid);
1469#endif
1470	int retval;
1471
1472#ifdef MP_WATCHDOG
1473	ap_watchdog(cpuid);
1474#endif
1475
1476	retval = mask & hlt_cpus_mask;
1477	while (mask & hlt_cpus_mask)
1478		__asm __volatile("sti; hlt" : : : "memory");
1479	return (retval);
1480}
1481