1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 121754 2003-10-30 21:42:17Z jhb $");
28
29#include "opt_cpu.h"
30#include "opt_kstack_pages.h"
31
32#ifdef SMP
33#include <machine/smptests.h>
34#else
35#if !defined(lint)
36#error
37#endif
38#endif
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/bus.h>
43#include <sys/cons.h>	/* cngetc() */
44#ifdef GPROF
45#include <sys/gmon.h>
46#endif
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/memrange.h>
52#include <sys/mutex.h>
53#include <sys/pcpu.h>
54#include <sys/proc.h>
55#include <sys/smp.h>
56#include <sys/sysctl.h>
57#include <sys/user.h>
58
59#include <vm/vm.h>
60#include <vm/vm_param.h>
61#include <vm/pmap.h>
62#include <vm/vm_kern.h>
63#include <vm/vm_extern.h>
64#include <vm/vm_map.h>
65
66#include <machine/apic.h>
67#include <machine/atomic.h>
68#include <machine/clock.h>
69#include <machine/cpu.h>
70#include <machine/cpufunc.h>
71#include <machine/mpapic.h>
72#include <machine/psl.h>
73#include <machine/segments.h>
74#include <machine/smp.h>
75#include <machine/smptests.h>	/** TEST_DEFAULT_CONFIG, TEST_TEST1 */
76#include <machine/tss.h>
77#include <machine/specialreg.h>
78#include <machine/privatespace.h>
79
80#if defined(APIC_IO)
81#include <machine/md_var.h>		/* setidt() */
82#include <i386/isa/icu.h>		/* IPIs */
83#include <i386/isa/intr_machdep.h>	/* IPIs */
84#endif	/* APIC_IO */
85
86#if defined(TEST_DEFAULT_CONFIG)
87#define MPFPS_MPFB1	TEST_DEFAULT_CONFIG
88#else
89#define MPFPS_MPFB1	mpfps->mpfb1
90#endif  /* TEST_DEFAULT_CONFIG */
91
92#define WARMBOOT_TARGET		0
93#define WARMBOOT_OFF		(KERNBASE + 0x0467)
94#define WARMBOOT_SEG		(KERNBASE + 0x0469)
95
96#ifdef PC98
97#define BIOS_BASE		(0xe8000)
98#define BIOS_SIZE		(0x18000)
99#else
100#define BIOS_BASE		(0xf0000)
101#define BIOS_SIZE		(0x10000)
102#endif
103#define BIOS_COUNT		(BIOS_SIZE/4)
104
105#define CMOS_REG		(0x70)
106#define CMOS_DATA		(0x71)
107#define BIOS_RESET		(0x0f)
108#define BIOS_WARM		(0x0a)
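
/*
 * Warm-boot glue: writing BIOS_WARM (0x0a) into the CMOS shutdown status
 * byte (offset BIOS_RESET) makes the BIOS, after the next reset/INIT,
 * jump through the real-mode far pointer stored at 0040:0067 instead of
 * running the full POST.  start_all_aps() points that vector (the
 * WARMBOOT_OFF/WARMBOOT_SEG locations above, seen through KERNBASE) at
 * the AP trampoline installed by install_ap_tramp().
 */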
109
110#define PROCENTRY_FLAG_EN	0x01
111#define PROCENTRY_FLAG_BP	0x02
112#define IOAPICENTRY_FLAG_EN	0x01
113
114
115/* MP Floating Pointer Structure */
116typedef struct MPFPS {
117	char    signature[4];
118	void   *pap;
119	u_char  length;
120	u_char  spec_rev;
121	u_char  checksum;
122	u_char  mpfb1;
123	u_char  mpfb2;
124	u_char  mpfb3;
125	u_char  mpfb4;
126	u_char  mpfb5;
127}      *mpfps_t;
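
/*
 * Per the MP spec, a non-zero mpfb1 selects one of the "default"
 * configurations from table 5-1 (no configuration table is present in
 * that case), and bit 7 of mpfb2 (IMCRP) indicates that the board is in
 * PIC mode rather than virtual wire mode.
 */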
128
129/* MP Configuration Table Header */
130typedef struct MPCTH {
131	char    signature[4];
132	u_short base_table_length;
133	u_char  spec_rev;
134	u_char  checksum;
135	u_char  oem_id[8];
136	u_char  product_id[12];
137	void   *oem_table_pointer;
138	u_short oem_table_size;
139	u_short entry_count;
140	void   *apic_address;
141	u_short extended_table_length;
142	u_char  extended_table_checksum;
143	u_char  reserved;
144}      *mpcth_t;
145
146
147typedef struct PROCENTRY {
148	u_char  type;
149	u_char  apic_id;
150	u_char  apic_version;
151	u_char  cpu_flags;
152	u_long  cpu_signature;
153	u_long  feature_flags;
154	u_long  reserved1;
155	u_long  reserved2;
156}      *proc_entry_ptr;
157
158typedef struct BUSENTRY {
159	u_char  type;
160	u_char  bus_id;
161	char    bus_type[6];
162}      *bus_entry_ptr;
163
164typedef struct IOAPICENTRY {
165	u_char  type;
166	u_char  apic_id;
167	u_char  apic_version;
168	u_char  apic_flags;
169	void   *apic_address;
170}      *io_apic_entry_ptr;
171
172typedef struct INTENTRY {
173	u_char  type;
174	u_char  int_type;
175	u_short int_flags;
176	u_char  src_bus_id;
177	u_char  src_bus_irq;
178	u_char  dst_apic_id;
179	u_char  dst_apic_int;
180}      *int_entry_ptr;
181
182/* descriptions of MP basetable entries */
183typedef struct BASETABLE_ENTRY {
184	u_char  type;
185	u_char  length;
186	char    name[16];
187}       basetable_entry;
188
189/*
190 * This code MUST be enabled both here and in mpboot.s.
191 * It traces the very early stages of AP boot by placing values in CMOS RAM.
192 * It is NORMALLY never needed, hence the primitive method of enabling it.
193 *
194#define CHECK_POINTS
195 */
196
197#if defined(CHECK_POINTS) && !defined(PC98)
198#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
199#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
200
201#define CHECK_INIT(D);				\
202	CHECK_WRITE(0x34, (D));			\
203	CHECK_WRITE(0x35, (D));			\
204	CHECK_WRITE(0x36, (D));			\
205	CHECK_WRITE(0x37, (D));			\
206	CHECK_WRITE(0x38, (D));			\
207	CHECK_WRITE(0x39, (D));
208
209#define CHECK_PRINT(S);				\
210	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
211	   (S),					\
212	   CHECK_READ(0x34),			\
213	   CHECK_READ(0x35),			\
214	   CHECK_READ(0x36),			\
215	   CHECK_READ(0x37),			\
216	   CHECK_READ(0x38),			\
217	   CHECK_READ(0x39));
218
219#else				/* CHECK_POINTS */
220
221#define CHECK_INIT(D)
222#define CHECK_PRINT(S)
223
224#endif				/* CHECK_POINTS */
225
226/*
227 * Values to send to the POST hardware.
228 */
229#define MP_BOOTADDRESS_POST	0x10
230#define MP_PROBE_POST		0x11
231#define MPTABLE_PASS1_POST	0x12
232
233#define MP_START_POST		0x13
234#define MP_ENABLE_POST		0x14
235#define MPTABLE_PASS2_POST	0x15
236
237#define START_ALL_APS_POST	0x16
238#define INSTALL_AP_TRAMP_POST	0x17
239#define START_AP_POST		0x18
240
241#define MP_ANNOUNCE_POST	0x19
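
/*
 * These values are handed to POSTCODE() at each stage of SMP startup so
 * that progress can be followed on a POST diagnostic display (and via
 * the current_postcode variable below) when the debugging hooks are
 * compiled in.
 */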
242
243static int need_hyperthreading_fixup;
244static u_int logical_cpus;
245static u_int logical_cpus_mask;
246
247/* used to hold the APs until we are ready to release them */
248static struct mtx ap_boot_mtx;
249
250/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
251int	current_postcode;
252
253int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
254int	mp_naps;		/* # of application processors */
255int	mp_nbusses;		/* # of busses */
256int	mp_napics;		/* # of IO APICs */
257int	boot_cpu_id;		/* designated BSP */
258vm_offset_t cpu_apic_address;
259vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
260extern	int nkpt;
261
262u_int32_t cpu_apic_versions[MAXCPU];
263u_int32_t *io_apic_versions;
264
265#ifdef APIC_INTR_REORDER
266struct {
267	volatile int *location;
268	int bit;
269} apic_isrbit_location[32];
270#endif
271
272struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];
273
274/*
275 * APIC ID logical/physical mapping structures.
276 * We oversize these to simplify boot-time config.
277 */
278int     cpu_num_to_apic_id[NAPICID];
279int     io_num_to_apic_id[NAPICID];
280int     apic_id_to_logical[NAPICID];
281
282/*
283 * CPU topology map data structures for HTT.
284 */
285struct	cpu_group	mp_groups[NAPICID];
286struct	cpu_top mp_top;
287struct	cpu_top *smp_topology;
288
289
290/* AP uses this during bootstrap.  Do not staticize.  */
291char *bootSTK;
292static int bootAP;
293
294/* Hotwire a 0->4MB V==P mapping */
295extern pt_entry_t *KPTphys;
296
297/* SMP page table page */
298extern pt_entry_t *SMPpt;
299
300struct pcb stoppcbs[MAXCPU];
301
302#ifdef APIC_IO
303/* Variables needed for SMP tlb shootdown. */
304vm_offset_t smp_tlb_addr1;
305vm_offset_t smp_tlb_addr2;
306volatile int smp_tlb_wait;
307static struct mtx smp_tlb_mtx;
308#endif
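
/*
 * The CPU initiating a shootdown stores the affected address (or range)
 * in smp_tlb_addr1/smp_tlb_addr2, sends one of the Xinvltlb/Xinvlpg/
 * Xinvlrng IPIs installed below, and then spins until smp_tlb_wait shows
 * that every targeted CPU has acknowledged the invalidation (the sender
 * side lives later in this file).
 */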
309
310/*
311 * Local data and functions.
312 */
313
314/* Set to 1 once we're ready to let the APs out of the pen. */
315static volatile int aps_ready = 0;
316
317static int	mp_capable;
318static u_int	boot_address;
319static u_int	base_memory;
320
321static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
322static mpfps_t	mpfps;
323static int	search_for_sig(u_int32_t target, int count);
324static void	mp_enable(u_int boot_addr);
325
326static void	mptable_hyperthread_fixup(u_int id_mask);
327static void	mptable_pass1(void);
328static int	mptable_pass2(void);
329static void	default_mp_table(int type);
330static void	fix_mp_table(void);
331static void	setup_apic_irq_mapping(void);
332static void	init_locks(void);
333static int	start_all_aps(u_int boot_addr);
334static void	install_ap_tramp(u_int boot_addr);
335static int	start_ap(int logicalCpu, u_int boot_addr);
336void		ap_init(void);
337static int	apic_int_is_bus_type(int intr, int bus_type);
338static void	release_aps(void *dummy);
339
340static int	hlt_cpus_mask;
341static int	hlt_logical_cpus = 1;
342static struct	sysctl_ctx_list logical_cpu_clist;
343
344/*
345 * initialize all the SMP locks
346 */
347
348/* lock region used by kernel profiling */
349int	mcount_lock;
350
351#ifdef USE_COMLOCK
352/* locks com (tty) data/hardware accesses: a FASTINTR() */
353struct mtx		com_mtx;
354#endif /* USE_COMLOCK */
355
356static void
357init_locks(void)
358{
359
360#ifdef USE_COMLOCK
361	mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
362#endif /* USE_COMLOCK */
363#ifdef APIC_IO
364	mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
365#endif
366}
367
368/*
369 * Calculate usable address in base memory for AP trampoline code.
370 */
371u_int
372mp_bootaddress(u_int basemem)
373{
374	POSTCODE(MP_BOOTADDRESS_POST);
375
376	base_memory = basemem * 1024;	/* convert to bytes */
377
378	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
379	if ((base_memory - boot_address) < bootMP_size)
380		boot_address -= 4096;	/* not enough, lower by 4k */
381
382	return boot_address;
383}
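
/*
 * For example, with 640K of base memory base_memory is 0xa0000; rounding
 * down to a page boundary leaves no room above boot_address, so the
 * trampoline ends up one page lower, at 0x9f000.
 */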
384
385
386/*
387 * Look for an Intel MP spec table (i.e., SMP-capable hardware).
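 *
 * The MP spec says the floating pointer structure may live in the first
 * kilobyte of the EBDA, in the last kilobyte of base memory, or in the
 * BIOS ROM between 0xf0000 and 0xfffff; that is the order searched below.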
388 */
389void
390i386_mp_probe(void)
391{
392	int     x;
393	u_long  segment;
394	u_int32_t target;
395
396	POSTCODE(MP_PROBE_POST);
397
398	/* see if EBDA exists */
399	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
400		/* search first 1K of EBDA */
401		target = (u_int32_t) (segment << 4);
402		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
403			goto found;
404	} else {
405		/* last 1K of base memory, effective 'top of base' passed in */
406		target = (u_int32_t) (base_memory - 0x400);
407		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
408			goto found;
409	}
410
411	/* search the BIOS */
412	target = (u_int32_t) BIOS_BASE;
413	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
414		goto found;
415
416	/* nothing found */
417	mpfps = (mpfps_t)0;
418	mp_capable = 0;
419	return;
420
421found:
422	/* calculate needed resources */
423	mpfps = (mpfps_t)x;
424	mptable_pass1();
425
426	/* flag the fact that this hardware can run multiple processors */
427	mp_capable = 1;
428}
429
430int
431cpu_mp_probe(void)
432{
433	/*
434	 * Record BSP in CPU map
435	 * This is done here so that MBUF init code works correctly.
436	 */
437	all_cpus = 1;
438
439	return (mp_capable);
440}
441
442/*
443 * Initialize the SMP hardware and the APIC and start up the APs.
444 */
445void
446cpu_mp_start(void)
447{
448	POSTCODE(MP_START_POST);
449
450	/* look for MP capable motherboard */
451	if (mp_capable)
452		mp_enable(boot_address);
453	else
454		panic("MP hardware not found!");
455
456	cpu_setregs();
457}
458
459
460/*
461 * Print various information about the SMP system hardware and setup.
462 */
463void
464cpu_mp_announce(void)
465{
466	int     x;
467
468	POSTCODE(MP_ANNOUNCE_POST);
469
470	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
471	printf(", version: 0x%08x", cpu_apic_versions[0]);
472	printf(", at 0x%08x\n", cpu_apic_address);
473	for (x = 1; x <= mp_naps; ++x) {
474		printf(" cpu%d (AP):  apic id: %2d", x, CPU_TO_ID(x));
475		printf(", version: 0x%08x", cpu_apic_versions[x]);
476		printf(", at 0x%08x\n", cpu_apic_address);
477	}
478
479#if defined(APIC_IO)
480	for (x = 0; x < mp_napics; ++x) {
481		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
482		printf(", version: 0x%08x", io_apic_versions[x]);
483		printf(", at 0x%08x\n", io_apic_address[x]);
484	}
485#else
486	printf(" Warning: APIC I/O disabled\n");
487#endif	/* APIC_IO */
488}
489
490/*
491 * AP CPUs call this to set up their protected mode environment.
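 * Each AP builds its own GDT slice and TSS out of its per-CPU private
 * space, loads the shared IDT and default LDT, and then clears
 * CR0_CD/CR0_NW/CR0_EM so that caching is enabled and the FPU is not
 * emulated, matching the BSP.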
492 */
493void
494init_secondary(void)
495{
496	int	gsel_tss;
497	int	x, myid = bootAP;
498	u_int	cr0;
499
500	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
501	gdt_segs[GPROC0_SEL].ssd_base =
502		(int) &SMP_prvspace[myid].pcpu.pc_common_tss;
503	SMP_prvspace[myid].pcpu.pc_prvspace =
504		&SMP_prvspace[myid].pcpu;
505
506	for (x = 0; x < NGDT; x++) {
507		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
508	}
509
510	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
511	r_gdt.rd_base = (int) &gdt[myid * NGDT];
512	lgdt(&r_gdt);			/* does magic intra-segment return */
513
514	lidt(&r_idt);
515
516	lldt(_default_ldt);
517	PCPU_SET(currentldt, _default_ldt);
518
519	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
520	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
521	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
522	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
523	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
524	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
525	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
526	ltr(gsel_tss);
527
528	/*
529	 * Set to a known state:
530	 * Set by mpboot.s: CR0_PG, CR0_PE
531	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
532	 */
533	cr0 = rcr0();
534	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
535	load_cr0(cr0);
536}
537
538
539#if defined(APIC_IO)
540/*
541 * Final configuration of the BSP's local APIC:
542 *  - disable 'pic mode'.
543 *  - disable 'virtual wire mode'.
544 *  - enable NMI.
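 *
 * In PIC mode the IMCR (selected by writing 0x70 to port 0x22, with data
 * at port 0x23) routes INTR/NMI from the 8259s directly to the BSP;
 * setting bit 0 below re-routes them through the local APIC instead
 * (MP spec section 3.6.2.1).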
545 */
546void
547bsp_apic_configure(void)
548{
549	u_char		byte;
550	u_int32_t	temp;
551
552	/* leave 'pic mode' if necessary */
553	if (picmode) {
554		outb(0x22, 0x70);	/* select IMCR */
555		byte = inb(0x23);	/* current contents */
556		byte |= 0x01;		/* mask external INTR */
557		outb(0x23, byte);	/* disconnect 8259s/NMI */
558	}
559
560	/* mask lint0 (the 8259 'virtual wire' connection) */
561	temp = lapic.lvt_lint0;
562	temp |= APIC_LVT_M;		/* set the mask */
563	lapic.lvt_lint0 = temp;
564
565	/* set up lint1 to handle NMI */
566	temp = lapic.lvt_lint1;
567	temp &= ~APIC_LVT_M;		/* clear the mask */
568	lapic.lvt_lint1 = temp;
569
570	if (bootverbose)
571		apic_dump("bsp_apic_configure()");
572}
573#endif  /* APIC_IO */
574
575
576/*******************************************************************
577 * local functions and data
578 */
579
580/*
581 * start the SMP system
582 */
583static void
584mp_enable(u_int boot_addr)
585{
586	int     x;
587#if defined(APIC_IO)
588	int     apic;
589	u_int   ux;
590#endif	/* APIC_IO */
591
592	POSTCODE(MP_ENABLE_POST);
593
594	/* turn on 4MB of V == P addressing so we can get to MP table */
595	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
596	invltlb();
597
598	/* examine the MP table for needed info, uses physical addresses */
599	x = mptable_pass2();
600
601	*(int *)PTD = 0;
602	invltlb();
603
604	/* can't process default configs until the CPU APIC is pmapped */
605	if (x)
606		default_mp_table(x);
607
608	/* post scan cleanup */
609	fix_mp_table();
610	setup_apic_irq_mapping();
611
612#if defined(APIC_IO)
613
614	/* fill the LOGICAL io_apic_versions table */
615	for (apic = 0; apic < mp_napics; ++apic) {
616		ux = io_apic_read(apic, IOAPIC_VER);
617		io_apic_versions[apic] = ux;
618		io_apic_set_id(apic, IO_TO_ID(apic));
619	}
620
621	/* program each IO APIC in the system */
622	for (apic = 0; apic < mp_napics; ++apic)
623		if (io_apic_setup(apic) < 0)
624			panic("IO APIC setup failure");
625
626	/* install a 'Spurious INTerrupt' vector */
627	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
628	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
629
630	/* install an inter-CPU IPI for TLB invalidation */
631	setidt(XINVLTLB_OFFSET, Xinvltlb,
632	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
633	setidt(XINVLPG_OFFSET, Xinvlpg,
634	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
635	setidt(XINVLRNG_OFFSET, Xinvlrng,
636	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
637
638	/* install an inter-CPU IPI for forwarding hardclock() */
639	setidt(XHARDCLOCK_OFFSET, Xhardclock,
640	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
641
642	/* install an inter-CPU IPI for forwarding statclock() */
643	setidt(XSTATCLOCK_OFFSET, Xstatclock,
644	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
645
646	/* install an inter-CPU IPI for lazy pmap release */
647	setidt(XLAZYPMAP_OFFSET, Xlazypmap,
648	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
649
650	/* install an inter-CPU IPI for all-CPU rendezvous */
651	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
652	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
653
654	/* install an inter-CPU IPI for forcing an additional software trap */
655	setidt(XCPUAST_OFFSET, Xcpuast,
656	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
657
658	/* install an inter-CPU IPI for CPU stop/restart */
659	setidt(XCPUSTOP_OFFSET, Xcpustop,
660	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
661
662#if defined(TEST_TEST1)
663	/* install a "fake hardware INTerrupt" vector */
664	setidt(XTEST1_OFFSET, Xtest1,
665	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
666#endif  /** TEST_TEST1 */
667
668#endif	/* APIC_IO */
669
670	/* initialize all SMP locks */
671	init_locks();
672
673	/* start each Application Processor */
674	start_all_aps(boot_addr);
675}
676
677
678/*
679 * look for the MP spec signature
680 */
681
682/* string defined by the Intel MP Spec as identifying the MP table */
683#define MP_SIG		0x5f504d5f	/* _MP_ */
684#define NEXT(X)		((X) += 4)
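/*
 * MP_SIG is the ASCII string "_MP_" read as a little-endian 32-bit word.
 * The spec puts the structure on a 16-byte boundary; scanning every
 * 4 bytes here is simply more conservative than required.
 */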
685static int
686search_for_sig(u_int32_t target, int count)
687{
688	int     x;
689	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
690
691	for (x = 0; x < count; NEXT(x))
692		if (addr[x] == MP_SIG)
693			/* make array index a byte index */
694			return (target + (x * sizeof(u_int32_t)));
695
696	return -1;
697}
698
699
700static basetable_entry basetable_entry_types[] =
701{
702	{0, 20, "Processor"},
703	{1, 8, "Bus"},
704	{2, 8, "I/O APIC"},
705	{3, 8, "I/O INT"},
706	{4, 8, "Local INT"}
707};
708
709typedef struct BUSDATA {
710	u_char  bus_id;
711	enum busTypes bus_type;
712}       bus_datum;
713
714typedef struct INTDATA {
715	u_char  int_type;
716	u_short int_flags;
717	u_char  src_bus_id;
718	u_char  src_bus_irq;
719	u_char  dst_apic_id;
720	u_char  dst_apic_int;
721	u_char	int_vector;
722}       io_int, local_int;
723
724typedef struct BUSTYPENAME {
725	u_char  type;
726	char    name[7];
727}       bus_type_name;
728
729static bus_type_name bus_type_table[] =
730{
731	{CBUS, "CBUS"},
732	{CBUSII, "CBUSII"},
733	{EISA, "EISA"},
734	{MCA, "MCA"},
735	{UNKNOWN_BUSTYPE, "---"},
736	{ISA, "ISA"},
737	{MCA, "MCA"},
738	{UNKNOWN_BUSTYPE, "---"},
739	{UNKNOWN_BUSTYPE, "---"},
740	{UNKNOWN_BUSTYPE, "---"},
741	{UNKNOWN_BUSTYPE, "---"},
742	{UNKNOWN_BUSTYPE, "---"},
743	{PCI, "PCI"},
744	{UNKNOWN_BUSTYPE, "---"},
745	{UNKNOWN_BUSTYPE, "---"},
746	{UNKNOWN_BUSTYPE, "---"},
747	{UNKNOWN_BUSTYPE, "---"},
748	{XPRESS, "XPRESS"},
749	{UNKNOWN_BUSTYPE, "---"}
750};
751/* from MP spec v1.4, table 5-1 */
752static int default_data[7][5] =
753{
754/*   nbus, id0, type0, id1, type1 */
755	{1, 0, ISA, 255, 255},
756	{1, 0, EISA, 255, 255},
757	{1, 0, EISA, 255, 255},
758	{1, 0, MCA, 255, 255},
759	{2, 0, ISA, 1, PCI},
760	{2, 0, EISA, 1, PCI},
761	{2, 0, MCA, 1, PCI}
762};
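
/*
 * Row N-1 describes default configuration type N; the corresponding
 * bus/APIC combinations are spelled out in default_mp_table() below.
 */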
763
764
765/* the bus data */
766static bus_datum *bus_data;
767
768/* the IO INT data, one entry per possible APIC INTerrupt */
769static io_int  *io_apic_ints;
770
771static int nintrs;
772
773static int processor_entry(proc_entry_ptr entry, int cpu);
774static int bus_entry(bus_entry_ptr entry, int bus);
775static int io_apic_entry(io_apic_entry_ptr entry, int apic);
776static int int_entry(int_entry_ptr entry, int intr);
777static int lookup_bus_type(char *name);
778
779
780/*
781 * 1st pass on motherboard's Intel MP specification table.
782 *
783 * initializes:
784 *	mp_ncpus = 1
785 *
786 * determines:
787 *	cpu_apic_address (common to all CPUs)
788 *	io_apic_address[N]
789 *	mp_naps
790 *	mp_nbusses
791 *	mp_napics
792 *	nintrs
793 */
794static void
795mptable_pass1(void)
796{
797	int	x;
798	mpcth_t	cth;
799	int	totalSize;
800	void*	position;
801	int	count;
802	int	type;
803	u_int	id_mask;
804
805	POSTCODE(MPTABLE_PASS1_POST);
806
807	/* clear various tables */
808	for (x = 0; x < NAPICID; ++x) {
809		io_apic_address[x] = ~0;	/* IO APIC address table */
810	}
811
812	/* init everything to empty */
813	mp_naps = 0;
814	mp_nbusses = 0;
815	mp_napics = 0;
816	nintrs = 0;
817	id_mask = 0;
818
819	/* check for use of 'default' configuration */
820	if (MPFPS_MPFB1 != 0) {
821		/* use default addresses */
822		cpu_apic_address = DEFAULT_APIC_BASE;
823		io_apic_address[0] = DEFAULT_IO_APIC_BASE;
824
825		/* fill in with defaults */
826		mp_naps = 2;		/* includes BSP */
827		mp_maxid = 1;
828		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
829#if defined(APIC_IO)
830		mp_napics = 1;
831		nintrs = 16;
832#endif	/* APIC_IO */
833	}
834	else {
835		if ((cth = mpfps->pap) == 0)
836			panic("MP Configuration Table Header MISSING!");
837
838		cpu_apic_address = (vm_offset_t) cth->apic_address;
839
840		/* walk the table, recording info of interest */
841		totalSize = cth->base_table_length - sizeof(struct MPCTH);
842		position = (u_char *) cth + sizeof(struct MPCTH);
843		count = cth->entry_count;
844
845		while (count--) {
846			switch (type = *(u_char *) position) {
847			case 0: /* processor_entry */
848				if (((proc_entry_ptr)position)->cpu_flags
849				    & PROCENTRY_FLAG_EN) {
850					++mp_naps;
851					mp_maxid++;
852					id_mask |= 1 <<
853					    ((proc_entry_ptr)position)->apic_id;
854				}
855				break;
856			case 1: /* bus_entry */
857				++mp_nbusses;
858				break;
859			case 2: /* io_apic_entry */
860				if (((io_apic_entry_ptr)position)->apic_flags
861					& IOAPICENTRY_FLAG_EN)
862					io_apic_address[mp_napics++] =
863					    (vm_offset_t)((io_apic_entry_ptr)
864						position)->apic_address;
865				break;
866			case 3: /* int_entry */
867				++nintrs;
868				break;
869			case 4:	/* local_int entry */
870				break;
871			default:
872				panic("mpfps Base Table HOSED!");
873				/* NOTREACHED */
874			}
875
876			totalSize -= basetable_entry_types[type].length;
877			(u_char*)position += basetable_entry_types[type].length;
878		}
879	}
880
881	/* qualify the numbers */
882	if (mp_naps > MAXCPU) {
883		printf("Warning: only using %d of %d available CPUs!\n",
884			MAXCPU, mp_naps);
885		mp_naps = MAXCPU;
886	}
887
888	/* See if we need to fixup HT logical CPUs. */
889	mptable_hyperthread_fixup(id_mask);
890
891	/*
892	 * Count the BSP.
893	 * This is also used as a counter while starting the APs.
894	 */
895	mp_ncpus = 1;
896
897	--mp_naps;	/* subtract the BSP */
898}
899
900
901/*
902 * 2nd pass on motherboard's Intel MP specification table.
903 *
904 * sets:
905 *	boot_cpu_id
906 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
907 *	CPU_TO_ID(N), logical CPU to APIC ID table
908 *	IO_TO_ID(N), logical IO to APIC ID table
909 *	bus_data[N]
910 *	io_apic_ints[N]
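 *
 * Pass 1 only counted entries; those counts size the tables allocated
 * here before they are filled in.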
911 */
912static int
913mptable_pass2(void)
914{
915	struct PROCENTRY proc;
916	int     x;
917	mpcth_t cth;
918	int     totalSize;
919	void*   position;
920	int     count;
921	int     type;
922	int     apic, bus, cpu, intr;
923	int	i, j;
924
925	POSTCODE(MPTABLE_PASS2_POST);
926
927	/* Initialize fake proc entry for use with HT fixup. */
928	bzero(&proc, sizeof(proc));
929	proc.type = 0;
930	proc.cpu_flags = PROCENTRY_FLAG_EN;
931
932	MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics,
933	    M_DEVBUF, M_WAITOK);
934	MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics,
935	    M_DEVBUF, M_WAITOK);
936	MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1),
937	    M_DEVBUF, M_WAITOK);
938	MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses,
939	    M_DEVBUF, M_WAITOK);
940
941	bzero(ioapic, sizeof(ioapic_t *) * mp_napics);
942
943	for (i = 0; i < mp_napics; i++) {
944		for (j = 0; j < mp_napics; j++) {
945			/* same page frame as a previous IO apic? */
946			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) ==
947			    (io_apic_address[i] & PG_FRAME)) {
948				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
949					+ (NPTEPG-2-j) * PAGE_SIZE
950					+ (io_apic_address[i] & PAGE_MASK));
951				break;
952			}
953			/* use this slot if available */
954			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) {
955				SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW |
956				    (io_apic_address[i] & PG_FRAME));
957				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
958					+ (NPTEPG-2-j) * PAGE_SIZE
959					+ (io_apic_address[i] & PAGE_MASK));
960				break;
961			}
962		}
963	}
964
965	/* clear various tables */
966	for (x = 0; x < NAPICID; ++x) {
967		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
968		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
969		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
970	}
971
972	/* clear bus data table */
973	for (x = 0; x < mp_nbusses; ++x)
974		bus_data[x].bus_id = 0xff;
975
976	/* clear IO APIC INT table */
977	for (x = 0; x < (nintrs + 1); ++x) {
978		io_apic_ints[x].int_type = 0xff;
979		io_apic_ints[x].int_vector = 0xff;
980	}
981
982	/* setup the cpu/apic mapping arrays */
983	boot_cpu_id = -1;
984
985	/* record whether PIC or virtual-wire mode */
986	picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
987
988	/* check for use of 'default' configuration */
989	if (MPFPS_MPFB1 != 0)
990		return MPFPS_MPFB1;	/* return default configuration type */
991
992	if ((cth = mpfps->pap) == 0)
993		panic("MP Configuration Table Header MISSING!");
994
995	/* walk the table, recording info of interest */
996	totalSize = cth->base_table_length - sizeof(struct MPCTH);
997	position = (u_char *) cth + sizeof(struct MPCTH);
998	count = cth->entry_count;
999	apic = bus = intr = 0;
1000	cpu = 1;				/* pre-count the BSP */
1001
1002	while (count--) {
1003		switch (type = *(u_char *) position) {
1004		case 0:
1005			if (processor_entry(position, cpu)) {
1006				if (logical_cpus != 0 &&
1007				    cpu % logical_cpus != 0)
1008					logical_cpus_mask |= (1 << cpu);
1009				++cpu;
1010			}
1011			if (need_hyperthreading_fixup) {
1012				/*
1013				 * Create fake mptable processor entries
1014				 * and feed them to processor_entry() to
1015				 * enumerate the logical CPUs.
1016				 */
1017				proc.apic_id = ((proc_entry_ptr)position)->apic_id;
1018				for (i = 1; i < logical_cpus; i++) {
1019					proc.apic_id++;
1020					(void)processor_entry(&proc, cpu);
1021					logical_cpus_mask |= (1 << cpu);
1022					cpu++;
1023				}
1024			}
1025			break;
1026		case 1:
1027			if (bus_entry(position, bus))
1028				++bus;
1029			break;
1030		case 2:
1031			if (io_apic_entry(position, apic))
1032				++apic;
1033			break;
1034		case 3:
1035			if (int_entry(position, intr))
1036				++intr;
1037			break;
1038		case 4:
1039			/* int_entry(position); */
1040			break;
1041		default:
1042			panic("mpfps Base Table HOSED!");
1043			/* NOTREACHED */
1044		}
1045
1046		totalSize -= basetable_entry_types[type].length;
1047		(u_char *) position += basetable_entry_types[type].length;
1048	}
1049
1050	if (boot_cpu_id == -1)
1051		panic("NO BSP found!");
1052
1053	/* report the fact that it's NOT a default configuration */
1054	return 0;
1055}
1056
1057/*
1058 * Check if we should perform a hyperthreading "fix-up" to
1059 * enumerate any logical CPUs that aren't already listed
1060 * in the table.
1061 *
1062 * XXX: We assume that all of the physical CPUs in the
1063 * system have the same number of logical CPUs.
1064 *
1065 * XXX: We assume that APIC IDs are allocated such that
1066 * the APIC IDs for a physical processor are aligned
1067 * with the number of logical CPUs in the processor.
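 *
 * For example, with two logical CPUs per package the BIOS is expected to
 * list the physical packages at APIC IDs 0 and 2, and the fixup below
 * synthesizes entries for the missing sibling IDs 1 and 3.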
1068 */
1069static void
1070mptable_hyperthread_fixup(u_int id_mask)
1071{
1072	u_int i, id;
1073	int logical;
1074
1075	/* Nothing to do if there is no HTT support. */
1076	if ((cpu_feature & CPUID_HTT) == 0)
1077		return;
1078	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
1079	if (logical_cpus <= 1)
1080		return;
1081
1082	/*
1083	 * For each APIC ID of a CPU that is set in the mask,
1084	 * scan the other candidate APIC ID's for this
1085	 * physical processor.  If any of those ID's are
1086	 * already in the table, then kill the fixup.
1087	 */
1088	for (id = 0; id <= MAXCPU; id++) {
1089		if ((id_mask & 1 << id) == 0)
1090			continue;
1091		/* First, make sure we are on a logical_cpus boundary. */
1092		if (id % logical_cpus != 0)
1093			return;
1094		for (i = id + 1; i < id + logical_cpus; i++)
1095			if ((id_mask & 1 << i) != 0)
1096				return;
1097	}
1098
1099	/*
1100	 * OK, the IDs checked out, so enable the fixup.  We have to fix up
1101	 * mp_naps and mp_maxid right now.
1102	 */
1103	need_hyperthreading_fixup = 1;
1104	mp_maxid *= logical_cpus;
1105	mp_naps *= logical_cpus;
1106
1107	/*
1108	 * Now setup the cpu topology map.
1109	 */
1110	mp_top.ct_count = mp_naps / logical_cpus;
1111	mp_top.ct_group = mp_groups;
1112
1113	/*
1114	 * The first logical id is directly after the last valid physical id.
1115	 */
1116	logical = mp_top.ct_count + 1;
1117
1118	for (i = 0; i < mp_top.ct_count; i++) {
1119		int j;
1120
1121		mp_groups[i].cg_mask = (1 << i);
1122		for (j = 1; j < logical_cpus; j++)
1123			mp_groups[i].cg_mask |= (1 << logical++);
1124		mp_groups[i].cg_count = logical_cpus;
1125		mp_groups[i].cg_children = 0;
1126	}
1127
1128	smp_topology = &mp_top;
1129}
1130
1131void
1132assign_apic_irq(int apic, int intpin, int irq)
1133{
1134	int x;
1135
1136	if (int_to_apicintpin[irq].ioapic != -1)
1137		panic("assign_apic_irq: inconsistent table");
1138
1139	int_to_apicintpin[irq].ioapic = apic;
1140	int_to_apicintpin[irq].int_pin = intpin;
1141	int_to_apicintpin[irq].apic_address = ioapic[apic];
1142	int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin;
1143
1144	for (x = 0; x < nintrs; x++) {
1145		if ((io_apic_ints[x].int_type == 0 ||
1146		     io_apic_ints[x].int_type == 3) &&
1147		    io_apic_ints[x].int_vector == 0xff &&
1148		    io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) &&
1149		    io_apic_ints[x].dst_apic_int == intpin)
1150			io_apic_ints[x].int_vector = irq;
1151	}
1152}
1153
1154void
1155revoke_apic_irq(int irq)
1156{
1157	int x;
1158	int oldapic;
1159	int oldintpin;
1160
1161	if (int_to_apicintpin[irq].ioapic == -1)
1162		panic("revoke_apic_irq: inconsistent table");
1163
1164	oldapic = int_to_apicintpin[irq].ioapic;
1165	oldintpin = int_to_apicintpin[irq].int_pin;
1166
1167	int_to_apicintpin[irq].ioapic = -1;
1168	int_to_apicintpin[irq].int_pin = 0;
1169	int_to_apicintpin[irq].apic_address = NULL;
1170	int_to_apicintpin[irq].redirindex = 0;
1171
1172	for (x = 0; x < nintrs; x++) {
1173		if ((io_apic_ints[x].int_type == 0 ||
1174		     io_apic_ints[x].int_type == 3) &&
1175		    io_apic_ints[x].int_vector != 0xff &&
1176		    io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) &&
1177		    io_apic_ints[x].dst_apic_int == oldintpin)
1178			io_apic_ints[x].int_vector = 0xff;
1179	}
1180}
1181
1182
1183static void
1184allocate_apic_irq(int intr)
1185{
1186	int apic;
1187	int intpin;
1188	int irq;
1189
1190	if (io_apic_ints[intr].int_vector != 0xff)
1191		return;		/* Interrupt handler already assigned */
1192
1193	if (io_apic_ints[intr].int_type != 0 &&
1194	    (io_apic_ints[intr].int_type != 3 ||
1195	     (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) &&
1196	      io_apic_ints[intr].dst_apic_int == 0)))
1197		return;		/* Not INT or ExtInt on != (0, 0) */
1198
1199	irq = 0;
1200	while (irq < APIC_INTMAPSIZE &&
1201	       int_to_apicintpin[irq].ioapic != -1)
1202		irq++;
1203
1204	if (irq >= APIC_INTMAPSIZE)
1205		return;		/* No free interrupt handlers */
1206
1207	apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id);
1208	intpin = io_apic_ints[intr].dst_apic_int;
1209
1210	assign_apic_irq(apic, intpin, irq);
1211	io_apic_setup_intpin(apic, intpin);
1212}
1213
1214
1215static void
1216swap_apic_id(int apic, int oldid, int newid)
1217{
1218	int x;
1219	int oapic;
1220
1221
1222	if (oldid == newid)
1223		return;			/* Nothing to do */
1224
1225	printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n",
1226	       apic, oldid, newid);
1227
1228	/* Swap physical APIC IDs in interrupt entries */
1229	for (x = 0; x < nintrs; x++) {
1230		if (io_apic_ints[x].dst_apic_id == oldid)
1231			io_apic_ints[x].dst_apic_id = newid;
1232		else if (io_apic_ints[x].dst_apic_id == newid)
1233			io_apic_ints[x].dst_apic_id = oldid;
1234	}
1235
1236	/* Swap physical APIC IDs in IO_TO_ID mappings */
1237	for (oapic = 0; oapic < mp_napics; oapic++)
1238		if (IO_TO_ID(oapic) == newid)
1239			break;
1240
1241	if (oapic < mp_napics) {
1242		printf("Changing APIC ID for IO APIC #%d from "
1243		       "%d to %d in MP table\n",
1244		       oapic, newid, oldid);
1245		IO_TO_ID(oapic) = oldid;
1246	}
1247	IO_TO_ID(apic) = newid;
1248}
1249
1250
1251static void
1252fix_id_to_io_mapping(void)
1253{
1254	int x;
1255
1256	for (x = 0; x < NAPICID; x++)
1257		ID_TO_IO(x) = -1;
1258
1259	for (x = 0; x <= mp_naps; x++)
1260		if (CPU_TO_ID(x) < NAPICID)
1261			ID_TO_IO(CPU_TO_ID(x)) = x;
1262
1263	for (x = 0; x < mp_napics; x++)
1264		if (IO_TO_ID(x) < NAPICID)
1265			ID_TO_IO(IO_TO_ID(x)) = x;
1266}
1267
1268
1269static int
1270first_free_apic_id(void)
1271{
1272	int freeid, x;
1273
1274	for (freeid = 0; freeid < NAPICID; freeid++) {
1275		for (x = 0; x <= mp_naps; x++)
1276			if (CPU_TO_ID(x) == freeid)
1277				break;
1278		if (x <= mp_naps)
1279			continue;
1280		for (x = 0; x < mp_napics; x++)
1281			if (IO_TO_ID(x) == freeid)
1282				break;
1283		if (x < mp_napics)
1284			continue;
1285		return freeid;
1286	}
1287	return freeid;
1288}
1289
1290
1291static int
1292io_apic_id_acceptable(int apic, int id)
1293{
1294	int cpu;		/* Logical CPU number */
1295	int oapic;		/* Logical IO APIC number for other IO APIC */
1296
1297	if (id >= NAPICID)
1298		return 0;	/* Out of range */
1299
1300	for (cpu = 0; cpu <= mp_naps; cpu++)
1301		if (CPU_TO_ID(cpu) == id)
1302			return 0;	/* Conflict with CPU */
1303
1304	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
1305		if (IO_TO_ID(oapic) == id)
1306			return 0;	/* Conflict with other APIC */
1307
1308	return 1;		/* ID is acceptable for IO APIC */
1309}
1310
1311
1312/*
1313 * fix up problems found in the parsed Intel MP specification table
1314 */
1315static void
1316fix_mp_table(void)
1317{
1318	int	x;
1319	int	id;
1320	int	bus_0 = 0;	/* Stop GCC warning */
1321	int	bus_pci = 0;	/* Stop GCC warning */
1322	int	num_pci_bus;
1323	int	apic;		/* IO APIC unit number */
1324	int     freeid;		/* Free physical APIC ID */
1325	int	physid;		/* Current physical IO APIC ID */
1326
1327	/*
1328	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
1329	 * did it wrong.  The MP spec says that when more than 1 PCI bus
1330	 * exists the BIOS must begin with bus entries for the PCI bus and use
1331	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
1332	 * exists the BIOS can choose to ignore this ordering, and indeed many
1333	 * MP motherboards do ignore it.  This causes a problem when the PCI
1334	 * sub-system makes requests of the MP sub-system based on PCI bus
1335	 * numbers.	So here we look for the situation and renumber the
1336	 * busses and associated INTs in an effort to "make it right".
1337	 */
1338
1339	/* find bus 0, PCI bus, count the number of PCI busses */
1340	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
1341		if (bus_data[x].bus_id == 0) {
1342			bus_0 = x;
1343		}
1344		if (bus_data[x].bus_type == PCI) {
1345			++num_pci_bus;
1346			bus_pci = x;
1347		}
1348	}
1349	/*
1350	 * bus_0 == slot of bus with ID of 0
1351	 * bus_pci == slot of last PCI bus encountered
1352	 */
1353
1354	/* check the 1 PCI bus case for sanity */
1355	/* if it is number 0 all is well */
1356	if (num_pci_bus == 1 &&
1357	    bus_data[bus_pci].bus_id != 0) {
1358
1359		/* mis-numbered, swap with whichever bus uses slot 0 */
1360
1361		/* swap the bus entry types */
1362		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
1363		bus_data[bus_0].bus_type = PCI;
1364
1365		/* swap each relevant INTerrupt entry */
1366		id = bus_data[bus_pci].bus_id;
1367		for (x = 0; x < nintrs; ++x) {
1368			if (io_apic_ints[x].src_bus_id == id) {
1369				io_apic_ints[x].src_bus_id = 0;
1370			}
1371			else if (io_apic_ints[x].src_bus_id == 0) {
1372				io_apic_ints[x].src_bus_id = id;
1373			}
1374		}
1375	}
1376
1377	/* Assign IO APIC IDs.
1378	 *
1379	 * First try the existing ID. If a conflict is detected, try
1380	 * the ID in the MP table.  If a conflict is still detected, find
1381	 * a free id.
1382	 *
1383	 * We cannot use the ID_TO_IO table before all conflicts have been
1384	 * resolved and the table has been corrected.
1385	 */
1386	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */
1387
1388		/* First try to use the value set by the BIOS */
1389		physid = io_apic_get_id(apic);
1390		if (io_apic_id_acceptable(apic, physid)) {
1391			if (IO_TO_ID(apic) != physid)
1392				swap_apic_id(apic, IO_TO_ID(apic), physid);
1393			continue;
1394		}
1395
1396		/* Then check if the value in the MP table is acceptable */
1397		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
1398			continue;
1399
1400		/* Last resort, find a free APIC ID and use it */
1401		freeid = first_free_apic_id();
1402		if (freeid >= NAPICID)
1403			panic("No free physical APIC IDs found");
1404
1405		if (io_apic_id_acceptable(apic, freeid)) {
1406			swap_apic_id(apic, IO_TO_ID(apic), freeid);
1407			continue;
1408		}
1409		panic("Free physical APIC ID not usable");
1410	}
1411	fix_id_to_io_mapping();
1412
1413	/* detect and fix broken Compaq MP table */
1414	if (apic_int_type(0, 0) == -1) {
1415		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
1416		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
1417		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
1418		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
1419		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
1420		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
1421		nintrs++;
1422	}
1423}
1424
1425
1426/* Assign low level interrupt handlers */
1427static void
1428setup_apic_irq_mapping(void)
1429{
1430	int	x;
1431	int	int_vector;
1432
1433	/* Clear array */
1434	for (x = 0; x < APIC_INTMAPSIZE; x++) {
1435		int_to_apicintpin[x].ioapic = -1;
1436		int_to_apicintpin[x].int_pin = 0;
1437		int_to_apicintpin[x].apic_address = NULL;
1438		int_to_apicintpin[x].redirindex = 0;
1439	}
1440
1441	/* First assign ISA/EISA interrupts */
1442	for (x = 0; x < nintrs; x++) {
1443		int_vector = io_apic_ints[x].src_bus_irq;
1444		if (int_vector < APIC_INTMAPSIZE &&
1445		    io_apic_ints[x].int_vector == 0xff &&
1446		    int_to_apicintpin[int_vector].ioapic == -1 &&
1447		    (apic_int_is_bus_type(x, ISA) ||
1448		     apic_int_is_bus_type(x, EISA)) &&
1449		    io_apic_ints[x].int_type == 0) {
1450			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
1451					io_apic_ints[x].dst_apic_int,
1452					int_vector);
1453		}
1454	}
1455
1456	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
1457	for (x = 0; x < nintrs; x++) {
1458		if (io_apic_ints[x].dst_apic_int == 0 &&
1459		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
1460		    io_apic_ints[x].int_vector == 0xff &&
1461		    int_to_apicintpin[0].ioapic == -1 &&
1462		    io_apic_ints[x].int_type == 3) {
1463			assign_apic_irq(0, 0, 0);
1464			break;
1465		}
1466	}
1467	/* PCI interrupt assignment is deferred */
1468}
1469
1470
1471static int
1472processor_entry(proc_entry_ptr entry, int cpu)
1473{
1474	/* check for usability */
1475	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
1476		return 0;
1477
1478	if (entry->apic_id >= NAPICID)
1479		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
1480	/* check for BSP flag */
1481	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
1482		boot_cpu_id = entry->apic_id;
1483		CPU_TO_ID(0) = entry->apic_id;
1484		ID_TO_CPU(entry->apic_id) = 0;
1485		return 0;	/* it's already been counted */
1486	}
1487
1488	/* add another AP to list, if less than max number of CPUs */
1489	else if (cpu < MAXCPU) {
1490		CPU_TO_ID(cpu) = entry->apic_id;
1491		ID_TO_CPU(entry->apic_id) = cpu;
1492		return 1;
1493	}
1494
1495	return 0;
1496}
1497
1498
1499static int
1500bus_entry(bus_entry_ptr entry, int bus)
1501{
1502	int     x;
1503	char    c, name[8];
1504
1505	/* encode the name into an index */
1506	for (x = 0; x < 6; ++x) {
1507		if ((c = entry->bus_type[x]) == ' ')
1508			break;
1509		name[x] = c;
1510	}
1511	name[x] = '\0';
1512
1513	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
1514		panic("unknown bus type: '%s'", name);
1515
1516	bus_data[bus].bus_id = entry->bus_id;
1517	bus_data[bus].bus_type = x;
1518
1519	return 1;
1520}
1521
1522
1523static int
1524io_apic_entry(io_apic_entry_ptr entry, int apic)
1525{
1526	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
1527		return 0;
1528
1529	IO_TO_ID(apic) = entry->apic_id;
1530	if (entry->apic_id < NAPICID)
1531		ID_TO_IO(entry->apic_id) = apic;
1532
1533	return 1;
1534}
1535
1536
1537static int
1538lookup_bus_type(char *name)
1539{
1540	int     x;
1541
1542	for (x = 0; x < MAX_BUSTYPE; ++x)
1543		if (strcmp(bus_type_table[x].name, name) == 0)
1544			return bus_type_table[x].type;
1545
1546	return UNKNOWN_BUSTYPE;
1547}
1548
1549
1550static int
1551int_entry(int_entry_ptr entry, int intr)
1552{
1553	int apic;
1554
1555	io_apic_ints[intr].int_type = entry->int_type;
1556	io_apic_ints[intr].int_flags = entry->int_flags;
1557	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
1558	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
1559	if (entry->dst_apic_id == 255) {
1560		/* This signal goes to all IO APICs.  Select an IO APIC
1561		   with a sufficient number of interrupt pins */
1562		for (apic = 0; apic < mp_napics; apic++)
1563			if (((io_apic_read(apic, IOAPIC_VER) &
1564			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
1565			    entry->dst_apic_int)
1566				break;
1567		if (apic < mp_napics)
1568			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
1569		else
1570			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1571	} else
1572		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1573	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
1574
1575	return 1;
1576}
1577
1578
1579static int
1580apic_int_is_bus_type(int intr, int bus_type)
1581{
1582	int     bus;
1583
1584	for (bus = 0; bus < mp_nbusses; ++bus)
1585		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
1586		    && ((int) bus_data[bus].bus_type == bus_type))
1587			return 1;
1588
1589	return 0;
1590}
1591
1592
1593/*
1594 * Given a traditional ISA INT mask, return an APIC mask.
1595 */
1596u_int
1597isa_apic_mask(u_int isa_mask)
1598{
1599	int isa_irq;
1600	int apic_pin;
1601
1602#if defined(SKIP_IRQ15_REDIRECT)
1603	if (isa_mask == (1 << 15)) {
1604		printf("skipping ISA IRQ15 redirect\n");
1605		return isa_mask;
1606	}
1607#endif  /* SKIP_IRQ15_REDIRECT */
1608
1609	isa_irq = ffs(isa_mask);		/* find its bit position */
1610	if (isa_irq == 0)			/* doesn't exist */
1611		return 0;
1612	--isa_irq;				/* make it zero based */
1613
1614	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
1615	if (apic_pin == -1)
1616		return 0;
1617
1618	return (1 << apic_pin);			/* convert pin# to a mask */
1619}
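
/*
 * setup_apic_irq_mapping() normally assigns ISA interrupts their original
 * IRQ numbers, so e.g. isa_apic_mask(1 << 4) usually just returns
 * (1 << 4); an ISA IRQ with no APIC connection yields 0.
 */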
1620
1621
1622/*
1623 * Determine which APIC pin an ISA/EISA INT is attached to.
1624 */
1625#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
1626#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
1627#define INTIRQ(I)	(io_apic_ints[(I)].int_vector)
1628#define INTAPIC(I)	(ID_TO_IO(io_apic_ints[(I)].dst_apic_id))
1629
1630#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)
1631int
1632isa_apic_irq(int isa_irq)
1633{
1634	int     intr;
1635
1636	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
1637		if (INTTYPE(intr) == 0) {		/* standard INT */
1638			if (SRCBUSIRQ(intr) == isa_irq) {
1639				if (apic_int_is_bus_type(intr, ISA) ||
1640			            apic_int_is_bus_type(intr, EISA)) {
1641					if (INTIRQ(intr) == 0xff)
1642						return -1; /* unassigned */
1643					return INTIRQ(intr);	/* found */
1644				}
1645			}
1646		}
1647	}
1648	return -1;					/* NOT found */
1649}
1650
1651
1652/*
1653 * Determine which APIC pin a PCI INT is attached to.
1654 */
1655#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
1656#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
1657#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
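/*
 * For PCI source buses the MP spec packs the device number into bits
 * 6:2 of src_bus_irq and the INT line (INTA#..INTD#) into bits 1:0,
 * which is what SRCBUSDEVICE() and SRCBUSLINE() pull apart.
 */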
1658int
1659pci_apic_irq(int pciBus, int pciDevice, int pciInt)
1660{
1661	int     intr;
1662
1663	--pciInt;					/* zero based */
1664
1665	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
1666		if ((INTTYPE(intr) == 0)		/* standard INT */
1667		    && (SRCBUSID(intr) == pciBus)
1668		    && (SRCBUSDEVICE(intr) == pciDevice)
1669		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
1670			if (apic_int_is_bus_type(intr, PCI)) {
1671				if (INTIRQ(intr) == 0xff)
1672					allocate_apic_irq(intr);
1673				if (INTIRQ(intr) == 0xff)
1674					return -1;	/* unassigned */
1675				return INTIRQ(intr);	/* exact match */
1676			}
1677
1678	return -1;					/* NOT found */
1679}
1680
1681int
1682next_apic_irq(int irq)
1683{
1684	int intr, ointr;
1685	int bus, bustype;
1686
1687	bus = 0;
1688	bustype = 0;
1689	for (intr = 0; intr < nintrs; intr++) {
1690		if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
1691			continue;
1692		bus = SRCBUSID(intr);
1693		bustype = apic_bus_type(bus);
1694		if (bustype != ISA &&
1695		    bustype != EISA &&
1696		    bustype != PCI)
1697			continue;
1698		break;
1699	}
1700	if (intr >= nintrs) {
1701		return -1;
1702	}
1703	for (ointr = intr + 1; ointr < nintrs; ointr++) {
1704		if (INTTYPE(ointr) != 0)
1705			continue;
1706		if (bus != SRCBUSID(ointr))
1707			continue;
1708		if (bustype == PCI) {
1709			if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
1710				continue;
1711			if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
1712				continue;
1713		}
1714		if (bustype == ISA || bustype == EISA) {
1715			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
1716				continue;
1717		}
1718		if (INTPIN(intr) == INTPIN(ointr))
1719			continue;
1720		break;
1721	}
1722	if (ointr >= nintrs) {
1723		return -1;
1724	}
1725	return INTIRQ(ointr);
1726}
1727#undef SRCBUSLINE
1728#undef SRCBUSDEVICE
1729#undef SRCBUSID
1730#undef SRCBUSIRQ
1731
1732#undef INTPIN
1733#undef INTIRQ
1734#undef INTAPIC
1735#undef INTTYPE
1736
1737
1738/*
1739 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
1740 *
1741 * XXX FIXME:
1742 *  Exactly what this means is unclear at this point.  It is a solution
1743 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
1744 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
1745 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
1746 *  option.
1747 */
1748int
1749undirect_isa_irq(int rirq)
1750{
1751#if defined(READY)
1752	if (bootverbose)
1753	    printf("Freeing redirected ISA irq %d.\n", rirq);
1754	/** FIXME: tickle the MB redirector chip */
1755	return -1;
1756#else
1757	if (bootverbose)
1758	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
1759	return 0;
1760#endif  /* READY */
1761}
1762
1763
1764/*
1765 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
1766 */
1767int
1768undirect_pci_irq(int rirq)
1769{
1770#if defined(READY)
1771	if (bootverbose)
1772		printf("Freeing redirected PCI irq %d.\n", rirq);
1773
1774	/** FIXME: tickle the MB redirector chip */
1775	return -1;
1776#else
1777	if (bootverbose)
1778		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
1779		       rirq);
1780	return 0;
1781#endif  /* READY */
1782}
1783
1784
1785/*
1786 * given a bus ID, return:
1787 *  the bus type if found
1788 *  -1 if NOT found
1789 */
1790int
1791apic_bus_type(int id)
1792{
1793	int     x;
1794
1795	for (x = 0; x < mp_nbusses; ++x)
1796		if (bus_data[x].bus_id == id)
1797			return bus_data[x].bus_type;
1798
1799	return -1;
1800}
1801
1802
1803/*
1804 * given a LOGICAL APIC# and pin#, return:
1805 *  the associated src bus ID if found
1806 *  -1 if NOT found
1807 */
1808int
1809apic_src_bus_id(int apic, int pin)
1810{
1811	int     x;
1812
1813	/* search each of the possible INTerrupt sources */
1814	for (x = 0; x < nintrs; ++x)
1815		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1816		    (pin == io_apic_ints[x].dst_apic_int))
1817			return (io_apic_ints[x].src_bus_id);
1818
1819	return -1;		/* NOT found */
1820}
1821
1822
1823/*
1824 * given a LOGICAL APIC# and pin#, return:
1825 *  the associated src bus IRQ if found
1826 *  -1 if NOT found
1827 */
1828int
1829apic_src_bus_irq(int apic, int pin)
1830{
1831	int     x;
1832
1833	for (x = 0; x < nintrs; x++)
1834		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1835		    (pin == io_apic_ints[x].dst_apic_int))
1836			return (io_apic_ints[x].src_bus_irq);
1837
1838	return -1;		/* NOT found */
1839}
1840
1841
1842/*
1843 * given a LOGICAL APIC# and pin#, return:
1844 *  the associated INTerrupt type if found
1845 *  -1 if NOT found
1846 */
1847int
1848apic_int_type(int apic, int pin)
1849{
1850	int     x;
1851
1852	/* search each of the possible INTerrupt sources */
1853	for (x = 0; x < nintrs; ++x)
1854		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1855		    (pin == io_apic_ints[x].dst_apic_int))
1856			return (io_apic_ints[x].int_type);
1857
1858	return -1;		/* NOT found */
1859}
1860
1861int
1862apic_irq(int apic, int pin)
1863{
1864	int x;
1865	int res;
1866
1867	for (x = 0; x < nintrs; ++x)
1868		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1869		    (pin == io_apic_ints[x].dst_apic_int)) {
1870			res = io_apic_ints[x].int_vector;
1871			if (res == 0xff)
1872				return -1;
1873			if (apic != int_to_apicintpin[res].ioapic)
1874				panic("apic_irq: inconsistent table");
1875			if (pin != int_to_apicintpin[res].int_pin)
1876				panic("apic_irq inconsistent table (2)");
1877			return res;
1878		}
1879	return -1;
1880}
1881
1882
1883/*
1884 * given a LOGICAL APIC# and pin#, return:
1885 *  the associated trigger mode if found
1886 *  -1 if NOT found
1887 */
1888int
1889apic_trigger(int apic, int pin)
1890{
1891	int     x;
1892
1893	/* search each of the possible INTerrupt sources */
1894	for (x = 0; x < nintrs; ++x)
1895		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1896		    (pin == io_apic_ints[x].dst_apic_int))
1897			return ((io_apic_ints[x].int_flags >> 2) & 0x03);
1898
1899	return -1;		/* NOT found */
1900}
1901
1902
1903/*
1904 * given a LOGICAL APIC# and pin#, return:
1905 *  the associated 'active' level if found
1906 *  -1 if NOT found
1907 */
1908int
1909apic_polarity(int apic, int pin)
1910{
1911	int     x;
1912
1913	/* search each of the possible INTerrupt sources */
1914	for (x = 0; x < nintrs; ++x)
1915		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1916		    (pin == io_apic_ints[x].dst_apic_int))
1917			return (io_apic_ints[x].int_flags & 0x03);
1918
1919	return -1;		/* NOT found */
1920}
1921
1922
1923/*
1924 * set data according to MP defaults
1925 * FIXME: probably not complete yet...
1926 */
1927static void
1928default_mp_table(int type)
1929{
1930	int     ap_cpu_id;
1931#if defined(APIC_IO)
1932	int     io_apic_id;
1933	int     pin;
1934#endif	/* APIC_IO */
1935
1936#if 0
1937	printf("  MP default config type: %d\n", type);
1938	switch (type) {
1939	case 1:
1940		printf("   bus: ISA, APIC: 82489DX\n");
1941		break;
1942	case 2:
1943		printf("   bus: EISA, APIC: 82489DX\n");
1944		break;
1945	case 3:
1946		printf("   bus: EISA, APIC: 82489DX\n");
1947		break;
1948	case 4:
1949		printf("   bus: MCA, APIC: 82489DX\n");
1950		break;
1951	case 5:
1952		printf("   bus: ISA+PCI, APIC: Integrated\n");
1953		break;
1954	case 6:
1955		printf("   bus: EISA+PCI, APIC: Integrated\n");
1956		break;
1957	case 7:
1958		printf("   bus: MCA+PCI, APIC: Integrated\n");
1959		break;
1960	default:
1961		printf("   future type\n");
1962		break;
1963		/* NOTREACHED */
1964	}
1965#endif	/* 0 */
1966
1967	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
1968	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;
1969
1970	/* BSP */
1971	CPU_TO_ID(0) = boot_cpu_id;
1972	ID_TO_CPU(boot_cpu_id) = 0;
1973
1974	/* one and only AP */
1975	CPU_TO_ID(1) = ap_cpu_id;
1976	ID_TO_CPU(ap_cpu_id) = 1;
1977
1978#if defined(APIC_IO)
1979	/* one and only IO APIC */
1980	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;
1981
1982	/*
1983	 * Sanity check; refer to MP spec section 3.6.6, last paragraph.
1984	 * This is necessary as some hardware doesn't properly set up the IO APIC.
1985	 */
1986#if defined(REALLY_ANAL_IOAPICID_VALUE)
1987	if (io_apic_id != 2) {
1988#else
1989	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
1990#endif	/* REALLY_ANAL_IOAPICID_VALUE */
1991		io_apic_set_id(0, 2);
1992		io_apic_id = 2;
1993	}
1994	IO_TO_ID(0) = io_apic_id;
1995	ID_TO_IO(io_apic_id) = 0;
1996#endif	/* APIC_IO */
1997
1998	/* fill out bus entries */
1999	switch (type) {
2000	case 1:
2001	case 2:
2002	case 3:
2003	case 4:
2004	case 5:
2005	case 6:
2006	case 7:
2007		bus_data[0].bus_id = default_data[type - 1][1];
2008		bus_data[0].bus_type = default_data[type - 1][2];
2009		bus_data[1].bus_id = default_data[type - 1][3];
2010		bus_data[1].bus_type = default_data[type - 1][4];
2011		break;
2012
2013	/* case 4: case 7:		   MCA NOT supported */
2014	default:		/* illegal/reserved */
2015		panic("BAD default MP config: %d", type);
2016		/* NOTREACHED */
2017	}
2018
2019#if defined(APIC_IO)
2020	/* general cases from MP v1.4, table 5-2 */
2021	for (pin = 0; pin < 16; ++pin) {
2022		io_apic_ints[pin].int_type = 0;
2023		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
2024		io_apic_ints[pin].src_bus_id = 0;
2025		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
2026		io_apic_ints[pin].dst_apic_id = io_apic_id;
2027		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
2028	}
2029
2030	/* special cases from MP v1.4, table 5-2 */
2031	if (type == 2) {
2032		io_apic_ints[2].int_type = 0xff;	/* N/C */
2033		io_apic_ints[13].int_type = 0xff;	/* N/C */
2034#if !defined(APIC_MIXED_MODE)
2035		/** FIXME: ??? */
2036		panic("sorry, can't support type 2 default yet");
2037#endif	/* APIC_MIXED_MODE */
2038	}
2039	else
2040		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */
2041
2042	if (type == 7)
2043		io_apic_ints[0].int_type = 0xff;	/* N/C */
2044	else
2045		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
2046#endif	/* APIC_IO */
2047}
2048
2049
2050/*
2051 * start each AP in our list
2052 */
2053static int
2054start_all_aps(u_int boot_addr)
2055{
2056	int     x, i, pg;
2057#ifndef PC98
2058	u_char  mpbiosreason;
2059#endif
2060	u_long  mpbioswarmvec;
2061	struct pcpu *pc;
2062	char *stack;
2063	uintptr_t kptbase;
2064
2065	POSTCODE(START_ALL_APS_POST);
2066
2067	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
2068
2069	/* initialize BSP's local APIC */
2070	apic_initialize();
2071	bsp_apic_ready = 1;
2072
2073	/* install the AP 1st level boot code */
2074	install_ap_tramp(boot_addr);
2075
2076
2077	/* save the current value of the warm-start vector */
2078	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
2079#ifndef PC98
2080	outb(CMOS_REG, BIOS_RESET);
2081	mpbiosreason = inb(CMOS_DATA);
2082#endif
2083
2084	/* set up temporary P==V mapping for AP boot */
2085	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
2086	kptbase = (uintptr_t)(void *)KPTphys;
2087	for (x = 0; x < NKPT; x++)
2088		PTD[x] = (pd_entry_t)(PG_V | PG_RW |
2089		    ((kptbase + x * PAGE_SIZE) & PG_FRAME));
2090	invltlb();
2091
2092	/* start each AP */
2093	for (x = 1; x <= mp_naps; ++x) {
2094
2095		/* This is a bit verbose; it will go away soon.  */
2096
2097		/* first page of AP's private space */
2098		pg = x * i386_btop(sizeof(struct privatespace));
2099
2100		/* allocate a new private data page */
2101		pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);
2102
2103		/* wire it into the private page table page */
2104		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));
2105
2106		/* allocate and set up an idle stack data page */
2107		stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */
2108		for (i = 0; i < KSTACK_PAGES; i++)
2109			SMPpt[pg + 1 + i] = (pt_entry_t)
2110			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
2111
2112		/* prime data page for it to use */
2113		pcpu_init(pc, x, sizeof(struct pcpu));
2114
2115		/* setup a vector to our boot code */
2116		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
2117		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
2118#ifndef PC98
2119		outb(CMOS_REG, BIOS_RESET);
2120		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
2121#endif
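		/*
		 * (For reference: 0x467/0x469 in the BIOS data area hold the
		 * offset/segment of the warm-boot jump vector, so the stored
		 * segment:offset of (boot_addr >> 4):0 points at the
		 * trampoline, and writing BIOS_WARM (0x0a) into CMOS
		 * shutdown-status register 0x0f asks the BIOS to jump through
		 * that vector without a full POST should the AP come up via a
		 * BIOS reset instead of directly at the STARTUP vector.)
		 */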
2122
2123		bootSTK = &SMP_prvspace[x].idlekstack[KSTACK_PAGES * PAGE_SIZE];
2124		bootAP = x;
2125
2126		/* attempt to start the Application Processor */
2127		CHECK_INIT(99);	/* setup checkpoints */
2128		if (!start_ap(x, boot_addr)) {
2129			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
2130			CHECK_PRINT("trace");	/* show checkpoints */
2131			/* better panic as the AP may be running loose */
2132			printf("panic y/n? [y] ");
2133			if (cngetc() != 'n')
2134				panic("bye-bye");
2135		}
2136		CHECK_PRINT("trace");		/* show checkpoints */
2137
2138		/* record its version info */
2139		cpu_apic_versions[x] = cpu_apic_versions[0];
2140
2141		all_cpus |= (1 << x);		/* record AP in CPU map */
2142	}
2143
2144	/* build our map of 'other' CPUs */
2145	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
2146
2147	/* fill in our (BSP) APIC version */
2148	cpu_apic_versions[0] = lapic.version;
2149
2150	/* restore the warmstart vector */
2151	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
2152#ifndef PC98
2153	outb(CMOS_REG, BIOS_RESET);
2154	outb(CMOS_DATA, mpbiosreason);
2155#endif
2156
2157	/*
2158	 * Set up the idle context for the BSP.  Similar to the above, except
2159	 * that some of it was done by locore, some by pmap.c, and some is
2160	 * implicit because the BSP is cpu#0, the page starts out zeroed, and
2161	 * we can refer to variables by name on the BSP.
2162	 */
2163
2164	/* Allocate and setup BSP idle stack */
2165	stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
2166	for (i = 0; i < KSTACK_PAGES; i++)
2167		SMPpt[1 + i] = (pt_entry_t)
2168		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
2169
2170	for (x = 0; x < NKPT; x++)
2171		PTD[x] = 0;
2172
2173	/* number of APs actually started */
2174	return mp_ncpus - 1;
2175}
2176
2177/*
2178 * load the 1st level AP boot code into base memory.
2179 */
2180
2181/* targets for relocation */
2182extern void bigJump(void);
2183extern void bootCodeSeg(void);
2184extern void bootDataSeg(void);
2185extern void MPentry(void);
2186extern u_int MP_GDT;
2187extern u_int mp_gdtbase;
2188
2189static void
2190install_ap_tramp(u_int boot_addr)
2191{
2192	int     x;
2193	int     size = *(int *) ((u_long) & bootMP_size);
2194	u_char *src = (u_char *) ((u_long) bootMP);
2195	u_char *dst = (u_char *) boot_addr + KERNBASE;
2196	u_int   boot_base = (u_int) bootMP;
2197	u_int8_t *dst8;
2198	u_int16_t *dst16;
2199	u_int32_t *dst32;
2200
2201	POSTCODE(INSTALL_AP_TRAMP_POST);
2202
2203	for (x = 0; x < size; ++x)
2204		*dst++ = *src++;
2205
2206	/*
2207	 * modify addresses in code we just moved to basemem. unfortunately we
2208	 * need fairly detailed info about mpboot.s for this to work.  changes
2209	 * to mpboot.s might require changes here.
2210	 */
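	/*
	 * (The 16-bit/8-bit split used in the segment patches below
	 * presumably matches the base[15:0] and base[23:16] fields of the
	 * GDT descriptors that mpboot.s builds, so that both the boot code
	 * and boot data segments end up based at boot_addr.)
	 */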
2211
2212	/* boot code is located in KERNEL space */
2213	dst = (u_char *) boot_addr + KERNBASE;
2214
2215	/* modify the lgdt arg */
2216	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
2217	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);
2218
2219	/* modify the ljmp target for MPentry() */
2220	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
2221	*dst32 = ((u_int) MPentry - KERNBASE);
2222
2223	/* modify the target for boot code segment */
2224	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
2225	dst8 = (u_int8_t *) (dst16 + 1);
2226	*dst16 = (u_int) boot_addr & 0xffff;
2227	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
2228
2229	/* modify the target for boot data segment */
2230	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
2231	dst8 = (u_int8_t *) (dst16 + 1);
2232	*dst16 = (u_int) boot_addr & 0xffff;
2233	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
2234}
2235
2236/*
2237 * This function starts the AP (application processor) given by the
2238 * logical CPU number 'logical_cpu', looking up its physical APIC ID
2239 * via CPU_TO_ID().  It does quite a "song and dance" to accomplish
2240 * this, which is necessary because of the nuances of the different
2241 * hardware we might encounter.  It isn't pretty, but it seems to work.
2242 */
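/*
 * (For reference, the sequence below appears to follow the "universal
 * start-up algorithm" of the Intel MP specification: assert INIT,
 * deassert INIT, wait ~10ms, then send up to two STARTUP IPIs carrying
 * vector = boot_addr >> 12, waiting ~200us after each; the AP begins
 * real-mode execution at physical address vector * 4096, i.e. at
 * boot_addr.)
 */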
2243static int
2244start_ap(int logical_cpu, u_int boot_addr)
2245{
2246	int     physical_cpu;
2247	int     vector;
2248	int     cpus;
2249	u_long  icr_lo, icr_hi;
2250
2251	POSTCODE(START_AP_POST);
2252
2253	/* get the PHYSICAL APIC ID# */
2254	physical_cpu = CPU_TO_ID(logical_cpu);
2255
2256	/* calculate the vector */
2257	vector = (boot_addr >> 12) & 0xff;
2258
2259	/* used as a watchpoint to signal AP startup */
2260	cpus = mp_ncpus;
2261
2262	/*
2263	 * First we do an INIT/RESET IPI.  This INIT IPI might be run,
2264	 * resetting and running the target CPU; OR it might be latched (P5
2265	 * bug), with the CPU waiting for a STARTUP IPI; OR it might simply
2266	 * be ignored.
2267	 */
2268
2269	/* setup the address for the target AP */
2270	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
2271	icr_hi |= (physical_cpu << 24);
2272	lapic.icr_hi = icr_hi;
2273
2274	/* setup common fields for subsequent IPIs */
2275	icr_lo = lapic.icr_lo & APIC_ICRLO_RESV_MASK;
2276	icr_lo |= APIC_DESTMODE_PHY;
2277
2278	/* do an INIT IPI: assert RESET */
2279	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
2280	    APIC_LEVEL_ASSERT | APIC_DELMODE_INIT;
2281
2282	/* wait for pending status end */
2283	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2284		 /* spin */ ;
2285
2286	/* do an INIT IPI: deassert RESET */
2287	lapic.icr_lo = icr_lo | APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
2288	    APIC_LEVEL_DEASSERT | APIC_DELMODE_INIT;
2289
2290	/* wait for pending status end */
2291	u_sleep(10000);		/* wait ~10mS */
2292	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2293		 /* spin */ ;
2294
2295	/*
2296	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
2297	 * latched (P5 bug), in which case this 1st STARTUP would terminate
2298	 * immediately and the previously started INIT IPI would continue;
2299	 * OR the previous INIT IPI has already run, and this STARTUP IPI
2300	 * will run; OR the previous INIT IPI was ignored, and this STARTUP
2301	 * IPI will run.
2302	 */
2303
2304	/* do a STARTUP IPI */
2305	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
2306	    APIC_LEVEL_DEASSERT | APIC_DELMODE_STARTUP | vector;
2307	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2308		 /* spin */ ;
2309	u_sleep(200);		/* wait ~200uS */
2310
2311	/*
2312	 * Finally we do a 2nd STARTUP IPI: this one runs IF the previous
2313	 * STARTUP IPI was cancelled by a latched INIT IPI; otherwise it is
2314	 * ignored, as only ONE STARTUP IPI is recognized after a hardware
2315	 * RESET or INIT IPI.
2316	 */
2317
2318	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
2319	    APIC_LEVEL_DEASSERT | APIC_DELMODE_STARTUP | vector;
2320	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2321		 /* spin */ ;
2322	u_sleep(200);		/* wait ~200uS */
2323
2324	/* wait for it to start */
2325	set_apic_timer(5000000);/* == 5 seconds */
2326	while (read_apic_timer())
2327		if (mp_ncpus > cpus)
2328			return 1;	/* return SUCCESS */
2329
2330	return 0;		/* return FAILURE */
2331}
2332
2333#if defined(APIC_IO)
2334
2335#ifdef COUNT_XINVLTLB_HITS
2336u_int xhits_gbl[MAXCPU];
2337u_int xhits_pg[MAXCPU];
2338u_int xhits_rng[MAXCPU];
2339SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
2340SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
2341    sizeof(xhits_gbl), "IU", "");
2342SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
2343    sizeof(xhits_pg), "IU", "");
2344SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
2345    sizeof(xhits_rng), "IU", "");
2346
2347u_int ipi_global;
2348u_int ipi_page;
2349u_int ipi_range;
2350u_int ipi_range_size;
2351SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
2352SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
2353SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
2354SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
2355    0, "");
2356
2357u_int ipi_masked_global;
2358u_int ipi_masked_page;
2359u_int ipi_masked_range;
2360u_int ipi_masked_range_size;
2361SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
2362    &ipi_masked_global, 0, "");
2363SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
2364    &ipi_masked_page, 0, "");
2365SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
2366    &ipi_masked_range, 0, "");
2367SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
2368    &ipi_masked_range_size, 0, "");
2369#endif
2370
2371/*
2372 * Flush the TLB on all other CPU's
2373 */
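/*
 * (The initiator publishes its operands in smp_tlb_addr1/smp_tlb_addr2,
 * zeroes smp_tlb_wait, sends the IPI and then spins; the invalidation
 * handlers on the other CPUs, which live in the low-level APIC vector
 * code, are expected to increment smp_tlb_wait once they have flushed,
 * releasing the initiator when all have checked in.)
 */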
2374static void
2375smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
2376{
2377	u_int ncpu;
2378	register_t eflags;
2379
2380	ncpu = mp_ncpus - 1;	/* does not shootdown self */
2381	if (ncpu < 1)
2382		return;		/* no other cpus */
2383	eflags = read_eflags();
2384	if ((eflags & PSL_I) == 0)
2385		panic("absolutely cannot call smp_tlb_shootdown with interrupts already disabled");
2386	mtx_lock_spin(&smp_tlb_mtx);
2387	smp_tlb_addr1 = addr1;
2388	smp_tlb_addr2 = addr2;
2389	atomic_store_rel_int(&smp_tlb_wait, 0);
2390	ipi_all_but_self(vector);
2391	while (smp_tlb_wait < ncpu)
2392		ia32_pause();
2393	mtx_unlock_spin(&smp_tlb_mtx);
2394}
2395
2396/*
2397 * This is about as magic as it gets.  fortune(1) has got similar code
2398 * for reversing bits in a word.  Who thinks up this stuff??
2399 *
2400 * Yes, it does appear to be consistently faster than:
2401 * while (i = ffs(m)) {
2402 *	m >>= i;
2403 *	bits++;
2404 * }
2405 * and
2406 * while (lsb = (m & -m)) {	// This is magic too
2407 * 	m &= ~lsb;		// or: m ^= lsb
2408 *	bits++;
2409 * }
2410 * Both of these latter forms do some very strange things on gcc-3.1 with
2411 * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
2412 * There is probably an SSE or MMX popcnt instruction.
2413 *
2414 * I wonder if this should be in libkern?
2415 *
2416 * XXX Stop the presses!  Another one:
2417 * static __inline u_int32_t
2418 * popcnt1(u_int32_t v)
2419 * {
2420 *	v -= ((v >> 1) & 0x55555555);
2421 *	v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
2422 *	v = (v + (v >> 4)) & 0x0F0F0F0F;
2423 *	return (v * 0x01010101) >> 24;
2424 * }
2425 * The downside is that it has a multiply.  With a pentium3 with
2426 * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
2427 * an imull, and in that case it is faster.  In most other cases
2428 * it appears slightly slower.
2429 */
2430static __inline u_int32_t
2431popcnt(u_int32_t m)
2432{
2433
2434	m = (m & 0x55555555) + ((m & 0xaaaaaaaa) >> 1);
2435	m = (m & 0x33333333) + ((m & 0xcccccccc) >> 2);
2436	m = (m & 0x0f0f0f0f) + ((m & 0xf0f0f0f0) >> 4);
2437	m = (m & 0x00ff00ff) + ((m & 0xff00ff00) >> 8);
2438	m = (m & 0x0000ffff) + ((m & 0xffff0000) >> 16);
2439	return m;
2440}
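/*
 * A small worked example of the reduction above: for m = 0xb (binary
 * 1011) the first step sums adjacent bits into 2-bit fields, giving
 * 0110, and the second step sums those fields, giving 0011 = 3, which
 * is indeed the number of set bits.  The remaining steps just widen the
 * fields to 8, 16 and 32 bits.
 */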
2441
2442static void
2443smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
2444{
2445	int ncpu, othercpus;
2446	register_t eflags;
2447
2448	othercpus = mp_ncpus - 1;
2449	if (mask == (u_int)-1) {
2450		ncpu = othercpus;
2451		if (ncpu < 1)
2452			return;
2453	} else {
2454		mask &= ~PCPU_GET(cpumask);
2455		if (mask == 0)
2456			return;
2457		ncpu = popcnt(mask);
2458		if (ncpu > othercpus) {
2459			/* XXX this should be a panic offence */
2460			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
2461			    ncpu, othercpus);
2462			ncpu = othercpus;
2463		}
2464		/* XXX should be a panic, implied by mask == 0 above */
2465		if (ncpu < 1)
2466			return;
2467	}
2468	eflags = read_eflags();
2469	if ((eflags & PSL_I) == 0)
2470		panic("absolutely cannot call smp_targeted_tlb_shootdown with interrupts already disabled");
2471	mtx_lock_spin(&smp_tlb_mtx);
2472	smp_tlb_addr1 = addr1;
2473	smp_tlb_addr2 = addr2;
2474	atomic_store_rel_int(&smp_tlb_wait, 0);
2475	if (mask == (u_int)-1)
2476		ipi_all_but_self(vector);
2477	else
2478		ipi_selected(mask, vector);
2479	while (smp_tlb_wait < ncpu)
2480		ia32_pause();
2481	mtx_unlock_spin(&smp_tlb_mtx);
2482}
2483#endif
2484
2485void
2486smp_invltlb(void)
2487{
2488#if defined(APIC_IO)
2489	if (smp_started) {
2490		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
2491#ifdef COUNT_XINVLTLB_HITS
2492		ipi_global++;
2493#endif
2494	}
2495#endif  /* APIC_IO */
2496}
2497
2498void
2499smp_invlpg(vm_offset_t addr)
2500{
2501#if defined(APIC_IO)
2502	if (smp_started) {
2503		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
2504#ifdef COUNT_XINVLTLB_HITS
2505		ipi_page++;
2506#endif
2507	}
2508#endif  /* APIC_IO */
2509}
2510
2511void
2512smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
2513{
2514#if defined(APIC_IO)
2515	if (smp_started) {
2516		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
2517#ifdef COUNT_XINVLTLB_HITS
2518		ipi_range++;
2519		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
2520#endif
2521	}
2522#endif  /* APIC_IO */
2523}
2524
2525void
2526smp_masked_invltlb(u_int mask)
2527{
2528#if defined(APIC_IO)
2529	if (smp_started) {
2530		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
2531#ifdef COUNT_XINVLTLB_HITS
2532		ipi_masked_global++;
2533#endif
2534	}
2535#endif  /* APIC_IO */
2536}
2537
2538void
2539smp_masked_invlpg(u_int mask, vm_offset_t addr)
2540{
2541#if defined(APIC_IO)
2542	if (smp_started) {
2543		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
2544#ifdef COUNT_XINVLTLB_HITS
2545		ipi_masked_page++;
2546#endif
2547	}
2548#endif  /* APIC_IO */
2549}
2550
2551void
2552smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
2553{
2554#if defined(APIC_IO)
2555	if (smp_started) {
2556		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
2557#ifdef COUNT_XINVLTLB_HITS
2558		ipi_masked_range++;
2559		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
2560#endif
2561	}
2562#endif  /* APIC_IO */
2563}
2564
2565
2566/*
2567 * This is called once the rest of the system is up and running and we're
2568 * ready to let the AP's out of the pen.
2569 */
2570void
2571ap_init(void)
2572{
2573	u_int	apic_id;
2574
2575	/* spin until all the AP's are ready */
2576	while (!aps_ready)
2577		ia32_pause();
2578
2579	/* BSP may have changed PTD while we were waiting */
2580	invltlb();
2581
2582#if defined(I586_CPU) && !defined(NO_F00F_HACK)
2583	lidt(&r_idt);
2584#endif
2585
2586	/* set up CPU registers and state */
2587	cpu_setregs();
2588
2589	/* set up FPU state on the AP */
2590	npxinit(__INITIAL_NPXCW__);
2591
2592	/* set up SSE registers */
2593	enable_sse();
2594
2595	/* A quick check from sanity claus */
2596	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
2597	if (PCPU_GET(cpuid) != apic_id) {
2598		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
2599		printf("SMP: apic_id = %d\n", apic_id);
2600		printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
2601		panic("cpuid mismatch! boom!!");
2602	}
2603
2604	/* Init local apic for irq's */
2605	apic_initialize();
2606
2607	/* Set memory range attributes for this CPU to match the BSP */
2608	mem_range_AP_init();
2609
2610	mtx_lock_spin(&ap_boot_mtx);
2611
2612	smp_cpus++;
2613
2614	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
2615	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
2616
2617	/* Build our map of 'other' CPUs. */
2618	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
2619
2620	if (bootverbose)
2621		apic_dump("ap_init()");
2622
2623	if (smp_cpus == mp_ncpus) {
2624		/* enable IPI's, tlb shootdown, freezes etc */
2625		atomic_store_rel_int(&smp_started, 1);
2626		smp_active = 1;	 /* historic */
2627	}
2628
2629	mtx_unlock_spin(&ap_boot_mtx);
2630
2631	/* wait until all the AP's are up */
2632	while (smp_started == 0)
2633		ia32_pause();
2634
2635	/* ok, now grab sched_lock and enter the scheduler */
2636	mtx_lock_spin(&sched_lock);
2637
2638	binuptime(PCPU_PTR(switchtime));
2639	PCPU_SET(switchticks, ticks);
2640
2641	cpu_throw(NULL, choosethread());	/* doesn't return */
2642
2643	panic("scheduler returned us to %s", __func__);
2644}
2645
2646/*
2647 * For statclock, we send an IPI to all other CPUs to have them call
2648 * this function.
2649 *
2650 * WARNING! unpend() will call statclock() directly and skip this
2651 * routine.
2652 */
2653void
2654forwarded_statclock(struct clockframe frame)
2655{
2656
2657	if (profprocs != 0)
2658		profclock(&frame);
2659	if (pscnt == psdiv)
2660		statclock(&frame);
2661}
2662
2663void
2664forward_statclock(void)
2665{
2666	int map;
2667
2668	CTR0(KTR_SMP, "forward_statclock");
2669
2670	if (!smp_started || cold || panicstr)
2671		return;
2672
2673	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
2674	if (map != 0)
2675		ipi_selected(map, IPI_STATCLOCK);
2676}
2677
2678/*
2679 * For each hardclock(), we send an IPI to all other CPUs to have them
2680 * execute this function.  It would be nice to reduce contention on
2681 * sched_lock by simply peeking at each CPU's user/kernel state, calling
2682 * hardclock_process() on the CPU receiving the clock interrupt, and
2683 * then just using a simple IPI to handle any ASTs as needed.
2684 *
2685 * WARNING! unpend() will call hardclock_process() directly and skip this
2686 * routine.
2687 */
2688void
2689forwarded_hardclock(struct clockframe frame)
2690{
2691
2692	hardclock_process(&frame);
2693}
2694
2695void
2696forward_hardclock(void)
2697{
2698	u_int map;
2699
2700	CTR0(KTR_SMP, "forward_hardclock");
2701
2702	if (!smp_started || cold || panicstr)
2703		return;
2704
2705	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
2706	if (map != 0)
2707		ipi_selected(map, IPI_HARDCLOCK);
2708}
2709
2710#ifdef APIC_INTR_REORDER
2711/*
2712 *	Maintain mapping from softintr vector to isr bit in local apic.
2713 */
2714void
2715set_lapic_isrloc(int intr, int vector)
2716{
2717	if (intr < 0 || intr > 32)
2718		panic("set_apic_isrloc: bad intr argument: %d",intr);
2719	if (vector < ICU_OFFSET || vector > 255)
2720		panic("set_apic_isrloc: bad vector argument: %d",vector);
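	/*
	 * (The local APIC in-service register is eight 32-bit registers,
	 * each on a 16-byte boundary; vector >> 5 selects the register and,
	 * since lapic's fields are 32 bits wide, the << 2 converts that
	 * index into the matching 16-byte stride.  The low 5 bits of the
	 * vector select the bit within the register.)
	 */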
2721	apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2);
2722	apic_isrbit_location[intr].bit = (1<<(vector & 31));
2723}
2724#endif
2725
2726/*
2727 * send an IPI to a set of cpus.
2728 */
2729void
2730ipi_selected(u_int32_t cpus, u_int ipi)
2731{
2732
2733	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
2734	selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED);
2735}
2736
2737/*
2738 * send an IPI containing the vector 'ipi' to all CPUs, including myself
2739 */
2740void
2741ipi_all(u_int ipi)
2742{
2743
2744	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
2745	apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED);
2746}
2747
2748/*
2749 * send an IPI to all CPUs EXCEPT myself
2750 */
2751void
2752ipi_all_but_self(u_int ipi)
2753{
2754
2755	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
2756	apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED);
2757}
2758
2759/*
2760 * send an IPI to myself
2761 */
2762void
2763ipi_self(u_int ipi)
2764{
2765
2766	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
2767	apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED);
2768}
2769
2770static void
2771release_aps(void *dummy __unused)
2772{
2773
2774	if (mp_ncpus == 1)
2775		return;
2776	mtx_lock_spin(&sched_lock);
2777	atomic_store_rel_int(&aps_ready, 1);
2778	while (smp_started == 0)
2779		ia32_pause();
2780	mtx_unlock_spin(&sched_lock);
2781}
2782SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
2783
2784static int
2785sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
2786{
2787	u_int mask;
2788	int error;
2789
2790	mask = hlt_cpus_mask;
2791	error = sysctl_handle_int(oidp, &mask, 0, req);
2792	if (error || !req->newptr)
2793		return (error);
2794
2795	if (logical_cpus_mask != 0 &&
2796	    (mask & logical_cpus_mask) == logical_cpus_mask)
2797		hlt_logical_cpus = 1;
2798	else
2799		hlt_logical_cpus = 0;
2800
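	/*
	 * Never allow every CPU to be halted at once: if the requested mask
	 * covers all CPUs, quietly exempt the BSP (cpu 0).
	 */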
2801	if ((mask & all_cpus) == all_cpus)
2802		mask &= ~(1<<0);
2803	hlt_cpus_mask = mask;
2804	return (error);
2805}
2806SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
2807    0, 0, sysctl_hlt_cpus, "IU", "");
2808
2809static int
2810sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
2811{
2812	int disable, error;
2813
2814	disable = hlt_logical_cpus;
2815	error = sysctl_handle_int(oidp, &disable, 0, req);
2816	if (error || !req->newptr)
2817		return (error);
2818
2819	if (disable)
2820		hlt_cpus_mask |= logical_cpus_mask;
2821	else
2822		hlt_cpus_mask &= ~logical_cpus_mask;
2823
2824	if ((hlt_cpus_mask & all_cpus) == all_cpus)
2825		hlt_cpus_mask &= ~(1<<0);
2826
2827	hlt_logical_cpus = disable;
2828	return (error);
2829}
2830
2831static void
2832cpu_hlt_setup(void *dummy __unused)
2833{
2834
2835	if (logical_cpus_mask != 0) {
2836		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
2837		    &hlt_logical_cpus);
2838		sysctl_ctx_init(&logical_cpu_clist);
2839		SYSCTL_ADD_PROC(&logical_cpu_clist,
2840		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
2841		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
2842		    sysctl_hlt_logical_cpus, "IU", "");
2843		SYSCTL_ADD_UINT(&logical_cpu_clist,
2844		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
2845		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
2846		    &logical_cpus_mask, 0, "");
2847
2848		if (hlt_logical_cpus)
2849			hlt_cpus_mask |= logical_cpus_mask;
2850	}
2851}
2852SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
2853
2854int
2855mp_grab_cpu_hlt(void)
2856{
2857	u_int mask = PCPU_GET(cpumask);
2858	int retval;
2859
2860	retval = mask & hlt_cpus_mask;
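	/*
	 * Spin in "sti; hlt" for as long as this CPU stays in
	 * hlt_cpus_mask.  STI only takes effect after the following
	 * instruction, so no interrupt can slip in between enabling
	 * interrupts and halting.
	 */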
2861	while (mask & hlt_cpus_mask)
2862		__asm __volatile("sti; hlt" : : : "memory");
2863	return (retval);
2864}
2865