1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 151631 2005-10-24 20:31:04Z jhb $");
28
29#include "opt_apic.h"
30#include "opt_cpu.h"
31#include "opt_kdb.h"
32#include "opt_kstack_pages.h"
33#include "opt_mp_watchdog.h"
34#include "opt_sched.h"
35
36#if !defined(lint)
37#if !defined(SMP)
38#error How did you get here?
39#endif
40
41#ifndef DEV_APIC
42#error The apic device is required for SMP, add "device apic" to your config file.
43#endif
44#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
45#error SMP not supported with CPU_DISABLE_CMPXCHG
46#endif
47#endif /* not lint */
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/bus.h>
52#include <sys/cons.h>	/* cngetc() */
53#ifdef GPROF
54#include <sys/gmon.h>
55#endif
56#include <sys/kernel.h>
57#include <sys/ktr.h>
58#include <sys/lock.h>
59#include <sys/malloc.h>
60#include <sys/memrange.h>
61#include <sys/mutex.h>
62#include <sys/pcpu.h>
63#include <sys/proc.h>
64#include <sys/smp.h>
65#include <sys/sysctl.h>
66
67#include <vm/vm.h>
68#include <vm/vm_param.h>
69#include <vm/pmap.h>
70#include <vm/vm_kern.h>
71#include <vm/vm_extern.h>
72
73#include <machine/apicreg.h>
74#include <machine/clock.h>
75#include <machine/md_var.h>
76#include <machine/mp_watchdog.h>
77#include <machine/pcb.h>
78#include <machine/smp.h>
79#include <machine/smptests.h>	/** COUNT_XINVLTLB_HITS */
80#include <machine/specialreg.h>
81#include <machine/privatespace.h>
82
83#define WARMBOOT_TARGET		0
84#define WARMBOOT_OFF		(KERNBASE + 0x0467)
85#define WARMBOOT_SEG		(KERNBASE + 0x0469)
86
87#define CMOS_REG		(0x70)
88#define CMOS_DATA		(0x71)
89#define BIOS_RESET		(0x0f)
90#define BIOS_WARM		(0x0a)
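
/*
 * A note on the warm-boot hooks above: CMOS register 0x0f is the BIOS
 * shutdown status byte, and the value 0x0a requests a warm start, i.e. a
 * jump through the vector kept in the BIOS data area at physical 0x467
 * (offset) and 0x469 (segment).  WARMBOOT_OFF and WARMBOOT_SEG are those
 * two locations seen through the KERNBASE mapping; start_all_aps() aims
 * the vector at the AP trampoline.  This is the MP-spec warm-reset path
 * for a CPU coming out of INIT; current CPUs normally reach the trampoline
 * via the STARTUP IPI vector instead (see start_ap()).
 */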
91
/*
 * This code MUST be enabled both here and in mpboot.s.
 * It follows the very early stages of AP boot by placing values in CMOS RAM.
 * It is NORMALLY never needed, hence the primitive method of enabling it:
 *
#define CHECK_POINTS
 */
99
100#if defined(CHECK_POINTS) && !defined(PC98)
101#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
102#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
103
104#define CHECK_INIT(D);				\
105	CHECK_WRITE(0x34, (D));			\
106	CHECK_WRITE(0x35, (D));			\
107	CHECK_WRITE(0x36, (D));			\
108	CHECK_WRITE(0x37, (D));			\
109	CHECK_WRITE(0x38, (D));			\
110	CHECK_WRITE(0x39, (D));
111
112#define CHECK_PRINT(S);				\
113	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
114	   (S),					\
115	   CHECK_READ(0x34),			\
116	   CHECK_READ(0x35),			\
117	   CHECK_READ(0x36),			\
118	   CHECK_READ(0x37),			\
119	   CHECK_READ(0x38),			\
120	   CHECK_READ(0x39));
121
122#else				/* CHECK_POINTS */
123
124#define CHECK_INIT(D)
125#define CHECK_PRINT(S)
126#define CHECK_WRITE(A, D)
127
128#endif				/* CHECK_POINTS */
129
130/*
131 * Values to send to the POST hardware.
132 */
133#define MP_BOOTADDRESS_POST	0x10
134#define MP_PROBE_POST		0x11
135#define MPTABLE_PASS1_POST	0x12
136
137#define MP_START_POST		0x13
138#define MP_ENABLE_POST		0x14
139#define MPTABLE_PASS2_POST	0x15
140
141#define START_ALL_APS_POST	0x16
142#define INSTALL_AP_TRAMP_POST	0x17
143#define START_AP_POST		0x18
144
145#define MP_ANNOUNCE_POST	0x19
146
147/* lock region used by kernel profiling */
148int	mcount_lock;
149
150/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
151int	current_postcode;
152
int	mp_naps;		/* # of application processors */
154int	boot_cpu_id = -1;	/* designated BSP */
155extern	int nkpt;
156
157/*
 * CPU topology map data structures for HTT.
159 */
160static struct cpu_group mp_groups[MAXCPU];
161static struct cpu_top mp_top;
162
163/* AP uses this during bootstrap.  Do not staticize.  */
164char *bootSTK;
165static int bootAP;
166
167/* Hotwire a 0->4MB V==P mapping */
168extern pt_entry_t *KPTphys;
169
170/* SMP page table page */
171extern pt_entry_t *SMPpt;
172
173struct pcb stoppcbs[MAXCPU];
174
175/* Variables needed for SMP tlb shootdown. */
176vm_offset_t smp_tlb_addr1;
177vm_offset_t smp_tlb_addr2;
178volatile int smp_tlb_wait;
179
180#ifdef KDB_STOP_NMI
181volatile cpumask_t ipi_nmi_pending;
182#endif
183
184#ifdef COUNT_IPIS
185/* Interrupt counts. */
186#ifdef IPI_PREEMPTION
187static u_long *ipi_preempt_counts[MAXCPU];
188#endif
189static u_long *ipi_ast_counts[MAXCPU];
190u_long *ipi_invltlb_counts[MAXCPU];
191u_long *ipi_invlrng_counts[MAXCPU];
192u_long *ipi_invlpg_counts[MAXCPU];
193u_long *ipi_rendezvous_counts[MAXCPU];
194u_long *ipi_lazypmap_counts[MAXCPU];
195#endif
196
197/*
198 * Local data and functions.
199 */
200
201static u_int logical_cpus;
202
/* used to hold the APs until we are ready to release them */
204static struct mtx ap_boot_mtx;
205
206/* Set to 1 once we're ready to let the APs out of the pen. */
207static volatile int aps_ready = 0;
208
209/*
210 * Store data from cpu_add() until later in the boot when we actually setup
211 * the APs.
212 */
213struct cpu_info {
214	int	cpu_present:1;
215	int	cpu_bsp:1;
216	int	cpu_disabled:1;
217} static cpu_info[MAXCPU];
218static int cpu_apic_ids[MAXCPU];
219
220/* Holds pending bitmap based IPIs per CPU */
221static volatile u_int cpu_ipi_pending[MAXCPU];
222
223static u_int boot_address;
224
225static void	set_logical_apic_ids(void);
226static int	start_all_aps(void);
227static void	install_ap_tramp(void);
228static int	start_ap(int apic_id);
229static void	release_aps(void *dummy);
230
231static int	hlt_logical_cpus;
232static u_int	hyperthreading_cpus;
233static cpumask_t	hyperthreading_cpus_mask;
234static int	hyperthreading_allowed = 1;
235static struct	sysctl_ctx_list logical_cpu_clist;
236
237static void
238mem_range_AP_init(void)
239{
240	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
241		mem_range_softc.mr_op->initAP(&mem_range_softc);
242}
243
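/*
 * mp_topology() builds the smp_topology map consumed by the scheduler.
 * Present CPUs are walked in APIC ID order and split into groups of
 * logical_cpus siblings: for example, with two logical CPUs per package,
 * APIC IDs 0 and 1 would land in one cpu_group and 2 and 3 in the next.
 * The example assumes packages enumerate their siblings with consecutive
 * APIC IDs, which is what the (apic_id % logical_cpus) test relies on.
 */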
244void
245mp_topology(void)
246{
247	struct cpu_group *group;
248	int logical_cpus;
249	int apic_id;
250	int groups;
251	int cpu;
252
253	/* Build the smp_topology map. */
254	/* Nothing to do if there is no HTT support. */
255	if ((cpu_feature & CPUID_HTT) == 0)
256		return;
257	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
258	if (logical_cpus <= 1)
259		return;
260	group = &mp_groups[0];
261	groups = 1;
262	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
263		if (!cpu_info[apic_id].cpu_present)
264			continue;
265		/*
266		 * If the current group has members and we're not a logical
267		 * cpu, create a new group.
268		 */
269		if (group->cg_count != 0 && (apic_id % logical_cpus) == 0) {
270			group++;
271			groups++;
272		}
273		group->cg_count++;
274		group->cg_mask |= 1 << cpu;
275		cpu++;
276	}
277
278	mp_top.ct_count = groups;
279	mp_top.ct_group = mp_groups;
280	smp_topology = &mp_top;
281}
282
283
284/*
285 * Calculate usable address in base memory for AP trampoline code.
286 */
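/*
 * The trampoline must sit in base memory (below 1MB) because the AP begins
 * executing it in real mode, and it must be page aligned because the
 * STARTUP IPI can only express the entry point as a physical page number
 * (boot_address >> 12, see start_ap()).
 */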
287u_int
288mp_bootaddress(u_int basemem)
289{
290	POSTCODE(MP_BOOTADDRESS_POST);
291
292	boot_address = trunc_page(basemem);	/* round down to 4k boundary */
293	if ((basemem - boot_address) < bootMP_size)
294		boot_address -= PAGE_SIZE;	/* not enough, lower by 4k */
295
296	return boot_address;
297}
298
299void
300cpu_add(u_int apic_id, char boot_cpu)
301{
302
303	if (apic_id >= MAXCPU) {
304		printf("SMP: CPU %d exceeds maximum CPU %d, ignoring\n",
305		    apic_id, MAXCPU - 1);
306		return;
307	}
308	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
309	    apic_id));
310	cpu_info[apic_id].cpu_present = 1;
311	if (boot_cpu) {
312		KASSERT(boot_cpu_id == -1,
313		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
314		    boot_cpu_id));
315		boot_cpu_id = apic_id;
316		cpu_info[apic_id].cpu_bsp = 1;
317	}
318	mp_ncpus++;
319	if (bootverbose)
320		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
321		    "AP");
322
323}
324
325void
326cpu_mp_setmaxid(void)
327{
328
329	mp_maxid = MAXCPU - 1;
330}
331
332int
333cpu_mp_probe(void)
334{
335
336	/*
337	 * Always record BSP in CPU map so that the mbuf init code works
338	 * correctly.
339	 */
340	all_cpus = 1;
341	if (mp_ncpus == 0) {
342		/*
343		 * No CPUs were found, so this must be a UP system.  Setup
344		 * the variables to represent a system with a single CPU
345		 * with an id of 0.
346		 */
347		mp_ncpus = 1;
348		return (0);
349	}
350
351	/* At least one CPU was found. */
352	if (mp_ncpus == 1) {
353		/*
354		 * One CPU was found, so this must be a UP system with
355		 * an I/O APIC.
356		 */
357		return (0);
358	}
359
360	/* At least two CPUs were found. */
361	return (1);
362}
363
364/*
 * Initialize the IPI handlers and start up the APs.
366 */
367void
368cpu_mp_start(void)
369{
370	int i;
371	u_int threads_per_cache, p[4];
372
373	POSTCODE(MP_START_POST);
374
375	/* Initialize the logical ID to APIC ID table. */
376	for (i = 0; i < MAXCPU; i++) {
377		cpu_apic_ids[i] = -1;
378		cpu_ipi_pending[i] = 0;
379	}
380
381	/* Install an inter-CPU IPI for TLB invalidation */
382	setidt(IPI_INVLTLB, IDTVEC(invltlb),
383	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
384	setidt(IPI_INVLPG, IDTVEC(invlpg),
385	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
386	setidt(IPI_INVLRNG, IDTVEC(invlrng),
387	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
388
389	/* Install an inter-CPU IPI for lazy pmap release */
390	setidt(IPI_LAZYPMAP, IDTVEC(lazypmap),
391	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
392
393	/* Install an inter-CPU IPI for all-CPU rendezvous */
394	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
395	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
396
397	/* Install generic inter-CPU IPI handler */
398	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
399	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
400
401	/* Install an inter-CPU IPI for CPU stop/restart */
402	setidt(IPI_STOP, IDTVEC(cpustop),
403	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
404
405
406	/* Set boot_cpu_id if needed. */
407	if (boot_cpu_id == -1) {
408		boot_cpu_id = PCPU_GET(apic_id);
409		cpu_info[boot_cpu_id].cpu_bsp = 1;
410	} else
411		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
412		    ("BSP's APIC ID doesn't match boot_cpu_id"));
413	cpu_apic_ids[0] = boot_cpu_id;
414
415	/* Start each Application Processor */
416	start_all_aps();
417
418	/* Setup the initial logical CPUs info. */
419	logical_cpus = logical_cpus_mask = 0;
420	if (cpu_feature & CPUID_HTT)
421		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
422
423	/*
424	 * Work out if hyperthreading is *really* enabled.  This
425	 * is made really ugly by the fact that processors lie: Dual
426	 * core processors claim to be hyperthreaded even when they're
427	 * not, presumably because they want to be treated the same
428	 * way as HTT with respect to per-cpu software licensing.
429	 * At the time of writing (May 12, 2005) the only hyperthreaded
430	 * cpus are from Intel, and Intel's dual-core processors can be
431	 * identified via the "deterministic cache parameters" cpuid
432	 * calls.
433	 */
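	/*
	 * For reference on the decoding below: CPUID leaf 4 ("deterministic
	 * cache parameters") reports in EAX bits 25:14 the maximum number of
	 * logical processors sharing the cache, minus one, and a cache type
	 * of 0 in EAX bits 4:0 means there are no further caches.  That is
	 * why threads_per_cache is ((p[0] & 0x3ffc000) >> 14) + 1 and the
	 * loop stops when (p[0] & 0x1f) == 0.
	 */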
434	/*
435	 * First determine if this is an Intel processor which claims
436	 * to have hyperthreading support.
437	 */
438	if ((cpu_feature & CPUID_HTT) &&
439	    (strcmp(cpu_vendor, "GenuineIntel") == 0)) {
440		/*
441		 * If the "deterministic cache parameters" cpuid calls
442		 * are available, use them.
443		 */
444		if (cpu_high >= 4) {
445			/* Ask the processor about up to 32 caches. */
446			for (i = 0; i < 32; i++) {
447				cpuid_count(4, i, p);
448				threads_per_cache = ((p[0] & 0x3ffc000) >> 14) + 1;
449				if (hyperthreading_cpus < threads_per_cache)
450					hyperthreading_cpus = threads_per_cache;
451				if ((p[0] & 0x1f) == 0)
452					break;
453			}
454		}
455
456		/*
457		 * If the deterministic cache parameters are not
458		 * available, or if no caches were reported to exist,
459		 * just accept what the HTT flag indicated.
460		 */
461		if (hyperthreading_cpus == 0)
462			hyperthreading_cpus = logical_cpus;
463	}
464
465	set_logical_apic_ids();
466}
467
468
469/*
470 * Print various information about the SMP system hardware and setup.
471 */
472void
473cpu_mp_announce(void)
474{
475	int i, x;
476
477	POSTCODE(MP_ANNOUNCE_POST);
478
479	/* List CPUs */
480	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
481	for (i = 1, x = 0; x < MAXCPU; x++) {
482		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
483			continue;
484		if (cpu_info[x].cpu_disabled)
485			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
486		else {
487			KASSERT(i < mp_ncpus,
488			    ("mp_ncpus and actual cpus are out of whack"));
489			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
490		}
491	}
492}
493
494/*
 * AP CPUs call this to initialize themselves.
496 */
497void
498init_secondary(void)
499{
500	vm_offset_t addr;
501	int	gsel_tss;
502	int	x, myid;
503	u_int	cr0;
504
505	/* bootAP is set in start_ap() to our ID. */
506	myid = bootAP;
507	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
508	gdt_segs[GPROC0_SEL].ssd_base =
509		(int) &SMP_prvspace[myid].pcpu.pc_common_tss;
510	SMP_prvspace[myid].pcpu.pc_prvspace =
511		&SMP_prvspace[myid].pcpu;
512
513	for (x = 0; x < NGDT; x++) {
514		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
515	}
516
517	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
518	r_gdt.rd_base = (int) &gdt[myid * NGDT];
519	lgdt(&r_gdt);			/* does magic intra-segment return */
520
521	lidt(&r_idt);
522
523	lldt(_default_ldt);
524	PCPU_SET(currentldt, _default_ldt);
525
526	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
527	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
528	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
529	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
530	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
531	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
532	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
533	ltr(gsel_tss);
534
535	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
536
537	/*
538	 * Set to a known state:
539	 * Set by mpboot.s: CR0_PG, CR0_PE
540	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
541	 */
542	cr0 = rcr0();
543	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
544	load_cr0(cr0);
545	CHECK_WRITE(0x38, 5);
546
547	/* Disable local APIC just to be sure. */
548	lapic_disable();
549
550	/* signal our startup to the BSP. */
551	mp_naps++;
552	CHECK_WRITE(0x39, 6);
553
	/* Spin until the BSP releases the APs. */
555	while (!aps_ready)
556		ia32_pause();
557
558	/* BSP may have changed PTD while we were waiting */
559	invltlb();
560	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
561		invlpg(addr);
562
563#if defined(I586_CPU) && !defined(NO_F00F_HACK)
564	lidt(&r_idt);
565#endif
566
567	/* set up CPU registers and state */
568	cpu_setregs();
569
570	/* set up FPU state on the AP */
571	npxinit(__INITIAL_NPXCW__);
572
573	/* set up SSE registers */
574	enable_sse();
575
576	/* A quick check from sanity claus */
577	if (PCPU_GET(apic_id) != lapic_id()) {
578		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
579		printf("SMP: actual apic_id = %d\n", lapic_id());
580		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
581		printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
582		panic("cpuid mismatch! boom!!");
583	}
584
585	/* Initialize curthread. */
586	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
587	PCPU_SET(curthread, PCPU_GET(idlethread));
588
589	mtx_lock_spin(&ap_boot_mtx);
590
591	/* Init local apic for irq's */
592	lapic_setup();
593
594	/* Set memory range attributes for this CPU to match the BSP */
595	mem_range_AP_init();
596
597	smp_cpus++;
598
599	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
600	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
601
602	/* Determine if we are a logical CPU. */
603	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
604		logical_cpus_mask |= PCPU_GET(cpumask);
605
606	/* Determine if we are a hyperthread. */
607	if (hyperthreading_cpus > 1 &&
608	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
609		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
610
611	/* Build our map of 'other' CPUs. */
612	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
613
614	if (bootverbose)
615		lapic_dump("AP");
616
617	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, TLB shootdown, freezes, etc. */
619		atomic_store_rel_int(&smp_started, 1);
620		smp_active = 1;	 /* historic */
621	}
622
623	mtx_unlock_spin(&ap_boot_mtx);
624
	/* wait until all the APs are up */
626	while (smp_started == 0)
627		ia32_pause();
628
629	/* ok, now grab sched_lock and enter the scheduler */
630	mtx_lock_spin(&sched_lock);
631
632	/*
633	 * Correct spinlock nesting.  The idle thread context that we are
634	 * borrowing was created so that it would start out with a single
635	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
636	 * explicitly acquired locks in this function, the nesting count
637	 * is now 2 rather than 1.  Since we are nested, calling
638	 * spinlock_exit() will simply adjust the counts without allowing
639	 * spin lock using code to interrupt us.
640	 */
641	spinlock_exit();
642	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
643
644	binuptime(PCPU_PTR(switchtime));
645	PCPU_SET(switchticks, ticks);
646
647	cpu_throw(NULL, choosethread());	/* doesn't return */
648
649	panic("scheduler returned us to %s", __func__);
650	/* NOTREACHED */
651}
652
653/*******************************************************************
654 * local functions and data
655 */
656
657/*
658 * Set the APIC logical IDs.
659 *
 * We want to cluster logical CPUs within the same APIC ID cluster.
 * Since logical CPUs are aligned, simply filling in the clusters in
 * APIC ID order works fine.  Note that this does not try to balance
 * the number of CPUs in each cluster. (XXX?)
664 */
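/*
 * In the xAPIC "cluster" logical destination model a cluster only holds a
 * few members (APIC_MAX_INTRACLUSTER_ID + 1), so the loop below starts a
 * fresh cluster from ioapic_next_logical_cluster() whenever the current one
 * fills up.  With the usual four-member clusters, present APIC IDs 0-3
 * would become logical IDs 0:0 through 0:3 and APIC ID 4 would open
 * cluster 1.
 */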
665static void
666set_logical_apic_ids(void)
667{
668	u_int apic_id, cluster, cluster_id;
669
670	/* Force us to allocate cluster 0 at the start. */
671	cluster = -1;
672	cluster_id = APIC_MAX_INTRACLUSTER_ID;
673	for (apic_id = 0; apic_id < MAXCPU; apic_id++) {
674		if (!cpu_info[apic_id].cpu_present)
675			continue;
676		if (cluster_id == APIC_MAX_INTRACLUSTER_ID) {
677			cluster = ioapic_next_logical_cluster();
678			cluster_id = 0;
679		} else
680			cluster_id++;
681		if (bootverbose)
682			printf("APIC ID: physical %u, logical %u:%u\n",
683			    apic_id, cluster, cluster_id);
684		lapic_set_logical_id(apic_id, cluster, cluster_id);
685	}
686}
687
688/*
689 * start each AP in our list
690 */
691static int
692start_all_aps(void)
693{
694#ifndef PC98
695	u_char mpbiosreason;
696#endif
697	struct pcpu *pc;
698	char *stack;
699	uintptr_t kptbase;
700	u_int32_t mpbioswarmvec;
701	int apic_id, cpu, i, pg;
702
703	POSTCODE(START_ALL_APS_POST);
704
705	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
706
707	/* install the AP 1st level boot code */
708	install_ap_tramp();
709
710	/* save the current value of the warm-start vector */
711	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
712#ifndef PC98
713	outb(CMOS_REG, BIOS_RESET);
714	mpbiosreason = inb(CMOS_DATA);
715#endif
716
717	/* set up temporary P==V mapping for AP boot */
718	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
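	/*
	 * The identity mapping matters because the AP turns on paging while
	 * it is still executing at the trampoline's low physical address;
	 * until it jumps up to its KERNBASE address, virtual must equal
	 * physical for the low page-directory slots borrowed here.  The
	 * mapping is torn down again at the bottom of this function.
	 */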
719	kptbase = (uintptr_t)(void *)KPTphys;
720	for (i = 0; i < NKPT; i++)
721		PTD[i] = (pd_entry_t)(PG_V | PG_RW |
722		    ((kptbase + i * PAGE_SIZE) & PG_FRAME));
723	invltlb();
724
725	/* start each AP */
726	for (cpu = 0, apic_id = 0; apic_id < MAXCPU; apic_id++) {
727
728		/* Ignore non-existent CPUs and the BSP. */
729		if (!cpu_info[apic_id].cpu_present ||
730		    cpu_info[apic_id].cpu_bsp)
731			continue;
732
733		/* Don't use this CPU if it has been disabled by a tunable. */
734		if (resource_disabled("lapic", apic_id)) {
735			cpu_info[apic_id].cpu_disabled = 1;
736			mp_ncpus--;
737			continue;
738		}
739
740		cpu++;
741
742		/* save APIC ID for this logical ID */
743		cpu_apic_ids[cpu] = apic_id;
744
745		/* first page of AP's private space */
746		pg = cpu * i386_btop(sizeof(struct privatespace));
747
748		/* allocate a new private data page */
749		pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);
750
751		/* wire it into the private page table page */
752		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));
753
754		/* allocate and set up an idle stack data page */
755		stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */
756		for (i = 0; i < KSTACK_PAGES; i++)
757			SMPpt[pg + 1 + i] = (pt_entry_t)
758			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
759
760		/* prime data page for it to use */
761		pcpu_init(pc, cpu, sizeof(struct pcpu));
762		pc->pc_apic_id = apic_id;
763
764		/* setup a vector to our boot code */
765		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
766		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
767#ifndef PC98
768		outb(CMOS_REG, BIOS_RESET);
769		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
770#endif
771
772		bootSTK = &SMP_prvspace[cpu].idlekstack[KSTACK_PAGES *
773		    PAGE_SIZE];
774		bootAP = cpu;
775
776		/* attempt to start the Application Processor */
777		CHECK_INIT(99);	/* setup checkpoints */
778		if (!start_ap(apic_id)) {
779			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
780			CHECK_PRINT("trace");	/* show checkpoints */
781			/* better panic as the AP may be running loose */
782			printf("panic y/n? [y] ");
783			if (cngetc() != 'n')
784				panic("bye-bye");
785		}
786		CHECK_PRINT("trace");		/* show checkpoints */
787
788		all_cpus |= (1 << cpu);		/* record AP in CPU map */
789	}
790
791	/* build our map of 'other' CPUs */
792	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
793
794	/* restore the warmstart vector */
795	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
796
797#ifndef PC98
798	outb(CMOS_REG, BIOS_RESET);
799	outb(CMOS_DATA, mpbiosreason);
800#endif
801
802	/*
803	 * Set up the idle context for the BSP.  Similar to above except
804	 * that some was done by locore, some by pmap.c and some is implicit
805	 * because the BSP is cpu#0 and the page is initially zero and also
	 * because we can refer to variables by name on the BSP.
807	 */
808
809	/* Allocate and setup BSP idle stack */
810	stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
811	for (i = 0; i < KSTACK_PAGES; i++)
812		SMPpt[1 + i] = (pt_entry_t)
813		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
814
815	for (i = 0; i < NKPT; i++)
816		PTD[i] = 0;
817	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
818
819	/* number of APs actually started */
820	return mp_naps;
821}
822
823/*
824 * load the 1st level AP boot code into base memory.
825 */
826
827/* targets for relocation */
828extern void bigJump(void);
829extern void bootCodeSeg(void);
830extern void bootDataSeg(void);
831extern void MPentry(void);
832extern u_int MP_GDT;
833extern u_int mp_gdtbase;
834
835static void
836install_ap_tramp(void)
837{
838	int     x;
839	int     size = *(int *) ((u_long) & bootMP_size);
840	vm_offset_t va = boot_address + KERNBASE;
841	u_char *src = (u_char *) ((u_long) bootMP);
842	u_char *dst = (u_char *) va;
843	u_int   boot_base = (u_int) bootMP;
844	u_int8_t *dst8;
845	u_int16_t *dst16;
846	u_int32_t *dst32;
847
848	POSTCODE(INSTALL_AP_TRAMP_POST);
849
	KASSERT(size <= PAGE_SIZE,
	    ("'size' does not fit into PAGE_SIZE, as expected."));
852	pmap_kenter(va, boot_address);
853	pmap_invalidate_page (kernel_pmap, va);
854	for (x = 0; x < size; ++x)
855		*dst++ = *src++;
856
857	/*
	 * Modify addresses in the code we just moved to basemem.  Unfortunately
	 * we need fairly detailed info about mpboot.s for this to work; changes
	 * to mpboot.s might require changes here.
861	 */
862
863	/* boot code is located in KERNEL space */
864	dst = (u_char *) va;
865
866	/* modify the lgdt arg */
867	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
868	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
869
870	/* modify the ljmp target for MPentry() */
871	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
872	*dst32 = ((u_int) MPentry - KERNBASE);
873
874	/* modify the target for boot code segment */
875	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
876	dst8 = (u_int8_t *) (dst16 + 1);
877	*dst16 = (u_int) boot_address & 0xffff;
878	*dst8 = ((u_int) boot_address >> 16) & 0xff;
879
880	/* modify the target for boot data segment */
881	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
882	dst8 = (u_int8_t *) (dst16 + 1);
883	*dst16 = (u_int) boot_address & 0xffff;
884	*dst8 = ((u_int) boot_address >> 16) & 0xff;
885}
886
887/*
888 * This function starts the AP (application processor) identified
889 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
890 * to accomplish this.  This is necessary because of the nuances
891 * of the different hardware we might encounter.  It isn't pretty,
892 * but it seems to work.
893 */
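/*
 * Roughly, the sequence below is the MP-specification INIT/STARTUP dance.
 * The STARTUP IPI carries an 8-bit vector naming the physical page at which
 * the AP starts executing in real mode, so a trampoline at physical address
 * 0x9f000, for instance, would be started with vector 0x9f.  This is why
 * mp_bootaddress() keeps boot_address page aligned and below 1MB.
 */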
894static int
895start_ap(int apic_id)
896{
897	int vector, ms;
898	int cpus;
899
900	POSTCODE(START_AP_POST);
901
902	/* calculate the vector */
903	vector = (boot_address >> 12) & 0xff;
904
905	/* used as a watchpoint to signal AP startup */
906	cpus = mp_naps;
907
908	/*
	 * First we do an INIT/RESET IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), the CPU waiting for a STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
913	 */
914
915	/* do an INIT IPI: assert RESET */
916	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
917	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
918
919	/* wait for pending status end */
920	lapic_ipi_wait(-1);
921
922	/* do an INIT IPI: deassert RESET */
923	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
924	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
925
926	/* wait for pending status end */
	DELAY(10000);		/* wait ~10ms */
928	lapic_ipi_wait(-1);
929
930	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue.  OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
937	 */
938
939	/* do a STARTUP IPI */
940	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
941	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
942	    vector, apic_id);
943	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200us */
945
946	/*
	 * Finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
948	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
949	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
950	 * recognized after hardware RESET or INIT IPI.
951	 */
952
953	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
954	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
955	    vector, apic_id);
956	lapic_ipi_wait(-1);
	DELAY(200);		/* wait ~200us */
958
959	/* Wait up to 5 seconds for it to start. */
960	for (ms = 0; ms < 5000; ms++) {
961		if (mp_naps > cpus)
962			return 1;	/* return SUCCESS */
963		DELAY(1000);
964	}
965	return 0;		/* return FAILURE */
966}
967
968#ifdef COUNT_XINVLTLB_HITS
969u_int xhits_gbl[MAXCPU];
970u_int xhits_pg[MAXCPU];
971u_int xhits_rng[MAXCPU];
972SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
973SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
974    sizeof(xhits_gbl), "IU", "");
975SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
976    sizeof(xhits_pg), "IU", "");
977SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
978    sizeof(xhits_rng), "IU", "");
979
980u_int ipi_global;
981u_int ipi_page;
982u_int ipi_range;
983u_int ipi_range_size;
984SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
985SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
986SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
987SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
988    0, "");
989
990u_int ipi_masked_global;
991u_int ipi_masked_page;
992u_int ipi_masked_range;
993u_int ipi_masked_range_size;
994SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
995    &ipi_masked_global, 0, "");
996SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
997    &ipi_masked_page, 0, "");
998SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
999    &ipi_masked_range, 0, "");
1000SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
1001    &ipi_masked_range_size, 0, "");
1002#endif /* COUNT_XINVLTLB_HITS */
1003
1004/*
 * Flush the TLB on all other CPUs.
1006 */
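/*
 * The shootdown is a simple counter handshake: the initiator (holding
 * smp_ipi_mtx) publishes smp_tlb_addr1/smp_tlb_addr2, zeroes smp_tlb_wait
 * and sends the invalidation IPI; each target handler is expected to do the
 * invalidation and atomically bump smp_tlb_wait, and the initiator spins
 * until the count reaches the number of other CPUs.
 */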
1007static void
1008smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1009{
1010	u_int ncpu;
1011
1012	ncpu = mp_ncpus - 1;	/* does not shootdown self */
1013	if (ncpu < 1)
1014		return;		/* no other cpus */
1015	mtx_assert(&smp_ipi_mtx, MA_OWNED);
1016	smp_tlb_addr1 = addr1;
1017	smp_tlb_addr2 = addr2;
1018	atomic_store_rel_int(&smp_tlb_wait, 0);
1019	ipi_all_but_self(vector);
1020	while (smp_tlb_wait < ncpu)
1021		ia32_pause();
1022}
1023
1024static void
1025smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1026{
1027	int ncpu, othercpus;
1028
1029	othercpus = mp_ncpus - 1;
1030	if (mask == (u_int)-1) {
1031		ncpu = othercpus;
1032		if (ncpu < 1)
1033			return;
1034	} else {
1035		mask &= ~PCPU_GET(cpumask);
1036		if (mask == 0)
1037			return;
1038		ncpu = bitcount32(mask);
1039		if (ncpu > othercpus) {
1040			/* XXX this should be a panic offence */
1041			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1042			    ncpu, othercpus);
1043			ncpu = othercpus;
1044		}
1045		/* XXX should be a panic, implied by mask == 0 above */
1046		if (ncpu < 1)
1047			return;
1048	}
1049	mtx_assert(&smp_ipi_mtx, MA_OWNED);
1050	smp_tlb_addr1 = addr1;
1051	smp_tlb_addr2 = addr2;
1052	atomic_store_rel_int(&smp_tlb_wait, 0);
1053	if (mask == (u_int)-1)
1054		ipi_all_but_self(vector);
1055	else
1056		ipi_selected(mask, vector);
1057	while (smp_tlb_wait < ncpu)
1058		ia32_pause();
1059}
1060
1061void
1062smp_invltlb(void)
1063{
1064
1065	if (smp_started) {
1066		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1067#ifdef COUNT_XINVLTLB_HITS
1068		ipi_global++;
1069#endif
1070	}
1071}
1072
1073void
1074smp_invlpg(vm_offset_t addr)
1075{
1076
1077	if (smp_started) {
1078		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1079#ifdef COUNT_XINVLTLB_HITS
1080		ipi_page++;
1081#endif
1082	}
1083}
1084
1085void
1086smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1087{
1088
1089	if (smp_started) {
1090		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1091#ifdef COUNT_XINVLTLB_HITS
1092		ipi_range++;
1093		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1094#endif
1095	}
1096}
1097
1098void
1099smp_masked_invltlb(u_int mask)
1100{
1101
1102	if (smp_started) {
1103		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1104#ifdef COUNT_XINVLTLB_HITS
1105		ipi_masked_global++;
1106#endif
1107	}
1108}
1109
1110void
1111smp_masked_invlpg(u_int mask, vm_offset_t addr)
1112{
1113
1114	if (smp_started) {
1115		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1116#ifdef COUNT_XINVLTLB_HITS
1117		ipi_masked_page++;
1118#endif
1119	}
1120}
1121
1122void
1123smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
1124{
1125
1126	if (smp_started) {
1127		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1128#ifdef COUNT_XINVLTLB_HITS
1129		ipi_masked_range++;
1130		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
1131#endif
1132	}
1133}
1134
1135void
1136ipi_bitmap_handler(struct clockframe frame)
1137{
1138	int cpu = PCPU_GET(cpuid);
1139	u_int ipi_bitmap;
1140
1141	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1142
1143#ifdef IPI_PREEMPTION
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1145#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
1147#endif
1148		mtx_lock_spin(&sched_lock);
1149		/* Don't preempt the idle thread */
1150		if (curthread->td_priority <  PRI_MIN_IDLE) {
1151			struct thread *running_thread = curthread;
1152			if (running_thread->td_critnest > 1)
1153				running_thread->td_owepreempt = 1;
1154			else
1155				mi_switch(SW_INVOL | SW_PREEMPT, NULL);
1156		}
1157		mtx_unlock_spin(&sched_lock);
1158	}
1159#endif
1160
	if (ipi_bitmap & (1 << IPI_AST)) {
1162#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
1164#endif
1165		/* Nothing to do for AST */
1166	}
1167}
1168
1169/*
1170 * send an IPI to a set of cpus.
1171 */
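/*
 * Small bitmap-based IPIs (IPI_IS_BITMAPED()) are coalesced: the requested
 * bit is OR'd into the target's cpu_ipi_pending word and the shared
 * IPI_BITMAP_VECTOR is only sent if that word was previously empty, since a
 * non-empty word means a bitmap IPI is already on its way and
 * ipi_bitmap_handler() will pick up the new bit as well.
 */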
1172void
1173ipi_selected(u_int32_t cpus, u_int ipi)
1174{
1175	int cpu;
1176	u_int bitmap = 0;
1177	u_int old_pending;
1178	u_int new_pending;
1179
1180	if (IPI_IS_BITMAPED(ipi)) {
1181		bitmap = 1 << ipi;
1182		ipi = IPI_BITMAP_VECTOR;
1183	}
1184
1185	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1186	while ((cpu = ffs(cpus)) != 0) {
1187		cpu--;
1188		cpus &= ~(1 << cpu);
1189
1190		KASSERT(cpu_apic_ids[cpu] != -1,
1191		    ("IPI to non-existent CPU %d", cpu));
1192
1193		if (bitmap) {
1194			do {
1195				old_pending = cpu_ipi_pending[cpu];
1196				new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending));
1198
1199			if (old_pending)
1200				continue;
1201		}
1202
1203		lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1204	}
1205
1206}
1207
1208/*
 * send an IPI interrupt containing 'ipi' to all CPUs, including myself
1210 */
1211void
1212ipi_all(u_int ipi)
1213{
1214
1215	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1216	lapic_ipi_vectored(ipi, APIC_IPI_DEST_ALL);
1217}
1218
1219/*
1220 * send an IPI to all CPUs EXCEPT myself
1221 */
1222void
1223ipi_all_but_self(u_int ipi)
1224{
1225
1226	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1227	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1228}
1229
1230/*
1231 * send an IPI to myself
1232 */
1233void
1234ipi_self(u_int ipi)
1235{
1236
1237	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1238	lapic_ipi_vectored(ipi, APIC_IPI_DEST_SELF);
1239}
1240
1241#ifdef KDB_STOP_NMI
1242/*
1243 * send NMI IPI to selected CPUs
1244 */
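/*
 * An NMI is used because it is delivered even when the target CPU has
 * interrupts disabled, e.g. when it is spinning with a lock held and the
 * debugger needs to stop it.  ipi_nmi_pending records which CPUs the NMI
 * was meant for, so ipi_nmi_handler() can tell a debugger stop request
 * apart from an unrelated NMI source.
 */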
1245
1246#define	BEFORE_SPIN	1000000
1247
1248void
1249ipi_nmi_selected(u_int32_t cpus)
1250{
1251	int cpu;
1252	register_t icrlo;
1253
1254	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
1255		| APIC_TRIGMOD_EDGE;
1256
1257	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
1258
1259	atomic_set_int(&ipi_nmi_pending, cpus);
1260
1261	while ((cpu = ffs(cpus)) != 0) {
1262		cpu--;
1263		cpus &= ~(1 << cpu);
1264
1265		KASSERT(cpu_apic_ids[cpu] != -1,
1266		    ("IPI NMI to non-existent CPU %d", cpu));
1267
1268		/* Wait for an earlier IPI to finish. */
1269		if (!lapic_ipi_wait(BEFORE_SPIN))
1270			panic("ipi_nmi_selected: previous IPI has not cleared");
1271
1272		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
1273	}
1274}
1275
1276
1277int
1278ipi_nmi_handler()
1279{
1280	int cpu = PCPU_GET(cpuid);
1281	int cpumask = PCPU_GET(cpumask);
1282
1283	if (!(atomic_load_acq_int(&ipi_nmi_pending) & cpumask))
1284		return 1;
1285
1286	atomic_clear_int(&ipi_nmi_pending, cpumask);
1287
1288	savectx(&stoppcbs[cpu]);
1289
1290	/* Indicate that we are stopped */
1291	atomic_set_int(&stopped_cpus, cpumask);
1292
1293	/* Wait for restart */
1294	while (!(atomic_load_acq_int(&started_cpus) & cpumask))
1295	    ia32_pause();
1296
1297	atomic_clear_int(&started_cpus, cpumask);
1298	atomic_clear_int(&stopped_cpus, cpumask);
1299
1300	if (cpu == 0 && cpustop_restartfunc != NULL)
1301		cpustop_restartfunc();
1302
1303	return 0;
1304}
1305
1306#endif /* KDB_STOP_NMI */
1307
1308/*
1309 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
1311 */
1312static void
1313release_aps(void *dummy __unused)
1314{
1315
1316	if (mp_ncpus == 1)
1317		return;
1318	mtx_lock_spin(&sched_lock);
1319	atomic_store_rel_int(&aps_ready, 1);
1320	while (smp_started == 0)
1321		ia32_pause();
1322	mtx_unlock_spin(&sched_lock);
1323}
1324SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1325
1326static int
1327sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
1328{
1329	u_int mask;
1330	int error;
1331
1332	mask = hlt_cpus_mask;
1333	error = sysctl_handle_int(oidp, &mask, 0, req);
1334	if (error || !req->newptr)
1335		return (error);
1336
1337	if (logical_cpus_mask != 0 &&
1338	    (mask & logical_cpus_mask) == logical_cpus_mask)
1339		hlt_logical_cpus = 1;
1340	else
1341		hlt_logical_cpus = 0;
1342
1343	if (! hyperthreading_allowed)
1344		mask |= hyperthreading_cpus_mask;
1345
1346	if ((mask & all_cpus) == all_cpus)
1347		mask &= ~(1<<0);
1348	hlt_cpus_mask = mask;
1349	return (error);
1350}
1351SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
1352    0, 0, sysctl_hlt_cpus, "IU",
1353    "Bitmap of CPUs to halt.  101 (binary) will halt CPUs 0 and 2.");
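
/*
 * A CPU whose bit is set in hlt_cpus_mask parks itself in mp_grab_cpu_hlt()
 * below, executing "sti; hlt" so it still services interrupts between
 * halts.  This is how the hlt_logical_cpus and hyperthreading_allowed knobs
 * idle HTT siblings; note that the handlers drop CPU 0 from the mask
 * whenever it would otherwise cover every CPU, so at least one CPU keeps
 * running.
 */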
1354
1355static int
1356sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
1357{
1358	int disable, error;
1359
1360	disable = hlt_logical_cpus;
1361	error = sysctl_handle_int(oidp, &disable, 0, req);
1362	if (error || !req->newptr)
1363		return (error);
1364
1365	if (disable)
1366		hlt_cpus_mask |= logical_cpus_mask;
1367	else
1368		hlt_cpus_mask &= ~logical_cpus_mask;
1369
1370	if (! hyperthreading_allowed)
1371		hlt_cpus_mask |= hyperthreading_cpus_mask;
1372
1373	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1374		hlt_cpus_mask &= ~(1<<0);
1375
1376	hlt_logical_cpus = disable;
1377	return (error);
1378}
1379
1380static int
1381sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS)
1382{
1383	int allowed, error;
1384
1385	allowed = hyperthreading_allowed;
1386	error = sysctl_handle_int(oidp, &allowed, 0, req);
1387	if (error || !req->newptr)
1388		return (error);
1389
1390	if (allowed)
1391		hlt_cpus_mask &= ~hyperthreading_cpus_mask;
1392	else
1393		hlt_cpus_mask |= hyperthreading_cpus_mask;
1394
1395	if (logical_cpus_mask != 0 &&
1396	    (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask)
1397		hlt_logical_cpus = 1;
1398	else
1399		hlt_logical_cpus = 0;
1400
1401	if ((hlt_cpus_mask & all_cpus) == all_cpus)
1402		hlt_cpus_mask &= ~(1<<0);
1403
1404	hyperthreading_allowed = allowed;
1405	return (error);
1406}
1407
1408static void
1409cpu_hlt_setup(void *dummy __unused)
1410{
1411
1412	if (logical_cpus_mask != 0) {
1413		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
1414		    &hlt_logical_cpus);
1415		sysctl_ctx_init(&logical_cpu_clist);
1416		SYSCTL_ADD_PROC(&logical_cpu_clist,
1417		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1418		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
1419		    sysctl_hlt_logical_cpus, "IU", "");
1420		SYSCTL_ADD_UINT(&logical_cpu_clist,
1421		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1422		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
1423		    &logical_cpus_mask, 0, "");
1424
1425		if (hlt_logical_cpus)
1426			hlt_cpus_mask |= logical_cpus_mask;
1427
1428		/*
1429		 * If necessary for security purposes, force
1430		 * hyperthreading off, regardless of the value
1431		 * of hlt_logical_cpus.
1432		 */
1433		if (hyperthreading_cpus_mask) {
1434			TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
1435			    &hyperthreading_allowed);
1436			SYSCTL_ADD_PROC(&logical_cpu_clist,
1437			    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
1438			    "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW,
1439			    0, 0, sysctl_hyperthreading_allowed, "IU", "");
1440			if (! hyperthreading_allowed)
1441				hlt_cpus_mask |= hyperthreading_cpus_mask;
1442		}
1443	}
1444}
1445SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
1446
1447int
1448mp_grab_cpu_hlt(void)
1449{
1450	u_int mask = PCPU_GET(cpumask);
1451#ifdef MP_WATCHDOG
1452	u_int cpuid = PCPU_GET(cpuid);
1453#endif
1454	int retval;
1455
1456#ifdef MP_WATCHDOG
1457	ap_watchdog(cpuid);
1458#endif
1459
1460	retval = mask & hlt_cpus_mask;
1461	while (mask & hlt_cpus_mask)
1462		__asm __volatile("sti; hlt" : : : "memory");
1463	return (retval);
1464}
1465
1466#ifdef COUNT_IPIS
1467/*
1468 * Setup interrupt counters for IPI handlers.
1469 */
1470static void
1471mp_ipi_intrcnt(void *dummy)
1472{
1473	char buf[64];
1474	int i;
1475
	for (i = 0; i <= mp_maxid; i++) {
1477		if (CPU_ABSENT(i))
1478			continue;
1479		snprintf(buf, sizeof(buf), "cpu%d: invltlb", i);
1480		intrcnt_add(buf, &ipi_invltlb_counts[i]);
1481		snprintf(buf, sizeof(buf), "cpu%d: invlrng", i);
1482		intrcnt_add(buf, &ipi_invlrng_counts[i]);
1483		snprintf(buf, sizeof(buf), "cpu%d: invlpg", i);
1484		intrcnt_add(buf, &ipi_invlpg_counts[i]);
1485#ifdef IPI_PREEMPTION
1486		snprintf(buf, sizeof(buf), "cpu%d: preempt", i);
1487		intrcnt_add(buf, &ipi_preempt_counts[i]);
1488#endif
1489		snprintf(buf, sizeof(buf), "cpu%d: ast", i);
1490		intrcnt_add(buf, &ipi_ast_counts[i]);
1491		snprintf(buf, sizeof(buf), "cpu%d: rendezvous", i);
1492		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1493		snprintf(buf, sizeof(buf), "cpu%d: lazypmap", i);
1494		intrcnt_add(buf, &ipi_lazypmap_counts[i]);
1495	}
1496}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1498#endif
1499