mp_machdep.c revision 187880
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 187880 2009-01-29 09:22:56Z jeff $");

#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <machine/apicreg.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/tss.h>

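/*
 * APs are started via the BIOS warm-boot path: the CMOS shutdown status
 * byte (register 0x0f) is set to BIOS_WARM (0x0a), telling the BIOS to
 * skip the memory test and jump through the warm-boot vector, and the
 * real-mode far pointer at physical 0x467/0x469 in the BIOS data area
 * is pointed at the AP trampoline.
 */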
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern  struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
static int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];

/* Temporary holder for double fault stack */
char *doublefault_stack;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* SMP page table page */
extern pt_entry_t *SMPpt;

extern int  _udatasel;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef STOP_NMI
volatile cpumask_t ipi_nmi_pending;

static void	ipi_nmi_selected(u_int32_t cpus);
#endif

/*
 * Local data and functions.
 */

#ifdef STOP_NMI
/*
 * Provide an alternate method of stopping other CPUs. If another CPU has
 * disabled interrupts the conventional STOP IPI will be blocked. This
 * NMI-based stop should get through in that case.
 */
static int stop_cpus_with_nmi = 1;
SYSCTL_INT(_debug, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
    &stop_cpus_with_nmi, 0, "");
TUNABLE_INT("debug.stop_cpus_with_nmi", &stop_cpus_with_nmi);
#else
#define	stop_cpus_with_nmi	0
#endif

static u_int logical_cpus;
/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
static int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static int	hlt_logical_cpus;
static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;
static int	hyperthreading_allowed = 1;
static struct	sysctl_ctx_list logical_cpu_clist;
static u_int	bootMP_size;

static void
mem_range_AP_init(void)
{
	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	bootMP_size = mptramp_end - mptramp_start;
	boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */
	if (((basemem * 1024) - boot_address) < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
	/* 3 levels of page table pages */
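	/* (one page each for the PML4, PDP and PD used by the trampoline;
	 *  start_all_aps() fills them in) */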
	mptramp_pagetables = boot_address - (PAGE_SIZE * 3);

	return mptramp_pagetables;
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should be already set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
			mp_maxid, mp_ncpus));
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;
	u_int threads_per_cache, p[4];

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Install an inter-CPU IPI for TLB invalidation */
	setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for cache invalidation. */
	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for all-CPU rendezvous */
	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);

	/* Install generic inter-CPU IPI handler */
	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
	       SDT_SYSIGT, SEL_KPL, 0);

	/* Install an inter-CPU IPI for CPU stop/restart */
	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
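	/* (CPUID leaf 1, EBX bits 23:16: logical CPUs per physical package) */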

	/*
	 * Work out if hyperthreading is *really* enabled.  This
	 * is made really ugly by the fact that processors lie: Dual
	 * core processors claim to be hyperthreaded even when they're
	 * not, presumably because they want to be treated the same
	 * way as HTT with respect to per-cpu software licensing.
	 * At the time of writing (May 12, 2005) the only hyperthreaded
	 * cpus are from Intel, and Intel's dual-core processors can be
	 * identified via the "deterministic cache parameters" cpuid
	 * calls.
	 */
	/*
	 * First determine if this is an Intel processor which claims
	 * to have hyperthreading support.
	 */
	if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * If the "deterministic cache parameters" cpuid calls
		 * are available, use them.
		 */
		if (cpu_high >= 4) {
			/* Ask the processor about the L1 cache. */
			for (i = 0; i < 1; i++) {
				cpuid_count(4, i, p);
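				/* EAX bits 25:14: maximum number of threads
				 * sharing this cache, minus one. */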
				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
				if (hyperthreading_cpus < threads_per_cache)
					hyperthreading_cpus = threads_per_cache;
				if ((p[0] & 0x1f) == 0)
					break;
			}
		}

		/*
		 * If the deterministic cache parameters are not
		 * available, or if no caches were reported to exist,
		 * just accept what the HTT flag indicated.
		 */
		if (hyperthreading_cpus == 0)
			hyperthreading_cpus = logical_cpus;
	}

	set_interrupt_apic_ids();
}


/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	struct pcpu *pc;
	u_int64_t msr, cr0;
	int cpu, gsel_tss, x;
	struct region_descriptor ap_gdt;

	/* Set by the startup code for us to use */
	cpu = bootAP;

	/* Init tss */
	common_tss[cpu] = common_tss[0];
	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
	common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];

	/* Prepare private GDT */
	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	   (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]);
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]);
	}
	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	ap_gdt.rd_base =  (long) &gdt[NGDT * cpu];
	lgdt(&ap_gdt);			/* does magic intra-segment return */

	/* Get per-cpu data */
	pc = &__pcpu[cpu];

	/* prime data page for it to use */
	pcpu_init(pc, cpu, sizeof(struct pcpu));
	pc->pc_apic_id = cpu_apic_ids[cpu];
	pc->pc_prvspace = pc;
	pc->pc_curthread = 0;
	pc->pc_tssp = &common_tss[cpu];
	pc->pc_rsp0 = 0;
	pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */

	lidt(&r_idt);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	cr0 = rcr0();
	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
	load_cr0(cr0);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
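	/* STAR bits 47:32: kernel CS/SS selector base loaded by SYSCALL;
	 * bits 63:48: 32-bit user CS/SS selector base loaded by SYSRET. */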
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/* Disable local APIC just to be sure. */
	lapic_disable();

	/* signal our startup to the BSP. */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX registers */
	initializecpu();

	/* set up FPU state on the AP */
	fpuinit();

	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local apic for irq's */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */

	load_cr4(rcr4() | CR4_PGE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);
	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
static int
start_all_aps(void)
{
	vm_offset_t va = boot_address + KERNBASE;
	u_int64_t *pt4, *pt3, *pt2;
	u_int32_t mpbioswarmvec;
	int apic_id, cpu, i;
	u_char mpbiosreason;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* install the AP 1st level boot code */
	pmap_kenter(va, boot_address);
	pmap_invalidate_page(kernel_pmap, va);
	bcopy(mptramp_start, (void *)va, bootMP_size);

	/* Locate the page tables, they'll be below the trampoline */
	pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE);
	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);

	/* Create the initial 1GB replicated page tables */
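	/*
	 * Every PML4 and PDP slot points at the same next-level page, so
	 * any virtual address resolves to (va % 1GB) physical.  This keeps
	 * the low-memory trampoline mapped across the real mode -> long
	 * mode transition, wherever it happens to be running from.
	 */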
	for (i = 0; i < 512; i++) {
		/* Each slot of the level 4 pages points to the same level 3 page */
		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
		pt4[i] |= PG_V | PG_RW | PG_U;

		/* Each slot of the level 3 pages points to the same level 2 page */
		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
		pt3[i] |= PG_V | PG_RW | PG_U;

		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
		pt2[i] = i * (2 * 1024 * 1024);
		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
	}

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);

	/* setup a vector to our boot code */
	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* allocate and set up an idle stack data page */
		bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
		doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);

		bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8;
		bootAP = cpu;

		/* attempt to start the Application Processor */
		if (!start_ap(apic_id)) {
			/* restore the warmstart vector */
			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* restore the warmstart vector */
	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;

	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);

	/* number of APs actually started */
	return mp_naps;
}


/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */
static int
start_ap(int apic_id)
{
	int vector, ms;
	int cpus;

	/* calculate the vector */
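	/* (the STARTUP IPI vector is the physical page number of the
	 *  trampoline: the AP begins executing in real mode at
	 *  vector << 12, i.e. boot_address) */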
	vector = (boot_address >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	/*
	 * First we do an INIT/RESET IPI: this INIT IPI might be run,
	 * resetting and running the target CPU; OR this INIT IPI might be
	 * latched (the P5 bug), leaving the CPU waiting for the STARTUP
	 * IPI; OR this INIT IPI might be ignored.
	 */

	/* do an INIT IPI: assert RESET */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);

	/* wait for pending status end */
	lapic_ipi_wait(-1);

	/* do an INIT IPI: deassert RESET */
	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);

	/* wait for pending status end */
	DELAY(10000);		/* wait ~10mS */
	lapic_ipi_wait(-1);

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (the P5 bug), in which case this 1st STARTUP would
	 * terminate immediately and the previously started INIT IPI would
	 * continue; OR the previous INIT IPI has already run, and this
	 * STARTUP IPI will run; OR the previous INIT IPI was ignored, and
	 * this STARTUP IPI will run.
	 */

	/* do a STARTUP IPI */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/*
	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run
	 * IF the previous STARTUP IPI was cancelled by a latched INIT IPI;
	 * otherwise it will be ignored, as only ONE STARTUP IPI is
	 * recognized after a hardware RESET or an INIT IPI.
	 */

	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200uS */

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs
 */
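/*
 * The shootdown handshake: the caller publishes the operands in
 * smp_tlb_addr1/smp_tlb_addr2, zeroes smp_tlb_wait and sends the IPI;
 * each target's interrupt handler performs the invalidation and
 * atomically increments smp_tlb_wait, so the caller just spins until
 * every other CPU has checked in.
 */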
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shoot down self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_rflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(u_int mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(u_int mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT))
		sched_preempt(curthread);

	/* Nothing to do for AST */
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

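	/*
	 * Bitmapped IPIs (e.g. IPI_PREEMPT) share the single
	 * IPI_BITMAP_VECTOR; the request is recorded in the target CPU's
	 * cpu_ipi_pending word and decoded by ipi_bitmap_handler(), so
	 * the hardware vector is only sent when no request was already
	 * pending there.
	 */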
	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending));

			if (old_pending)
				continue;
		}

		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
	}

}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

#define	BEFORE_SPIN	1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
		| APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
	    ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

static int
sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
{
	u_int mask;
	int error;

	mask = hlt_cpus_mask;
	error = sysctl_handle_int(oidp, &mask, 0, req);
	if (error || !req->newptr)
		return (error);

	if (logical_cpus_mask != 0 &&
	    (mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if (! hyperthreading_allowed)
		mask |= hyperthreading_cpus_mask;

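	/* Never halt every CPU; always leave CPU 0 running. */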
	if ((mask & all_cpus) == all_cpus)
		mask &= ~(1<<0);
	hlt_cpus_mask = mask;
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_hlt_cpus, "IU",
    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");

static int
sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
{
	int disable, error;

	disable = hlt_logical_cpus;
	error = sysctl_handle_int(oidp, &disable, 0, req);
	if (error || !req->newptr)
		return (error);

	if (disable)
		hlt_cpus_mask |= logical_cpus_mask;
	else
		hlt_cpus_mask &= ~logical_cpus_mask;

	if (! hyperthreading_allowed)
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hlt_logical_cpus = disable;
	return (error);
}

static int
sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
{
	int allowed, error;

	allowed = hyperthreading_allowed;
	error = sysctl_handle_int(oidp, &allowed, 0, req);
	if (error || !req->newptr)
		return (error);

	if (allowed)
		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
	else
		hlt_cpus_mask |= hyperthreading_cpus_mask;

	if (logical_cpus_mask != 0 &&
	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
		hlt_logical_cpus = 1;
	else
		hlt_logical_cpus = 0;

	if ((hlt_cpus_mask & all_cpus) == all_cpus)
		hlt_cpus_mask &= ~(1<<0);

	hyperthreading_allowed = allowed;
	return (error);
}

static void
cpu_hlt_setup(void *dummy __unused)
{

	if (logical_cpus_mask != 0) {
		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
		    &hlt_logical_cpus);
		sysctl_ctx_init(&logical_cpu_clist);
		SYSCTL_ADD_PROC(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
		    sysctl_hlt_logical_cpus, "IU", "");
		SYSCTL_ADD_UINT(&logical_cpu_clist,
		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
		    &logical_cpus_mask, 0, "");

		if (hlt_logical_cpus)
			hlt_cpus_mask |= logical_cpus_mask;

		/*
		 * If necessary for security purposes, force
		 * hyperthreading off, regardless of the value
		 * of hlt_logical_cpus.
		 */
		if (hyperthreading_cpus_mask) {
			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
			    &hyperthreading_allowed);
			SYSCTL_ADD_PROC(&logical_cpu_clist,
			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
			if (! hyperthreading_allowed)
				hlt_cpus_mask |= hyperthreading_cpus_mask;
		}
	}
}
SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);

int
mp_grab_cpu_hlt(void)
{
	u_int mask = PCPU_GET(cpumask);
#ifdef MP_WATCHDOG
	u_int cpuid = PCPU_GET(cpuid);
#endif
	int retval;

#ifdef MP_WATCHDOG
	ap_watchdog(cpuid);
#endif

	retval = mask & hlt_cpus_mask;
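	/*
	 * "sti; hlt" must be a single unit: STI only takes effect after
	 * the following instruction, so a wakeup IPI cannot arrive between
	 * the STI and the HLT and be lost.
	 */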
	while (mask & hlt_cpus_mask)
		__asm __volatile("sti; hlt" : : : "memory");
	return (retval);
}
1266