mp_machdep.c revision 120654
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 120654 2003-10-01 23:46:08Z peter $");
28
29#include "opt_cpu.h"
30#include "opt_kstack_pages.h"
31
32#ifdef SMP
33#include <machine/smptests.h>
34#else
35#if !defined(lint)
36#error
37#endif
38#endif
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/bus.h>
43#include <sys/cons.h>	/* cngetc() */
44#ifdef GPROF
45#include <sys/gmon.h>
46#endif
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/memrange.h>
52#include <sys/mutex.h>
53#include <sys/pcpu.h>
54#include <sys/proc.h>
55#include <sys/smp.h>
56#include <sys/sysctl.h>
57#include <sys/user.h>
58
59#include <vm/vm.h>
60#include <vm/vm_param.h>
61#include <vm/pmap.h>
62#include <vm/vm_kern.h>
63#include <vm/vm_extern.h>
64#include <vm/vm_map.h>
65
66#include <machine/apic.h>
67#include <machine/atomic.h>
68#include <machine/clock.h>
69#include <machine/cpu.h>
70#include <machine/cpufunc.h>
71#include <machine/mpapic.h>
72#include <machine/psl.h>
73#include <machine/segments.h>
74#include <machine/smp.h>
75#include <machine/smptests.h>	/** TEST_DEFAULT_CONFIG, TEST_TEST1 */
76#include <machine/tss.h>
77#include <machine/specialreg.h>
78#include <machine/privatespace.h>
79
80#if defined(APIC_IO)
81#include <machine/md_var.h>		/* setidt() */
82#include <i386/isa/icu.h>		/* IPIs */
83#include <i386/isa/intr_machdep.h>	/* IPIs */
84#endif	/* APIC_IO */
85
/*
 * MPFPS_MPFB1 yields "feature byte 1" of the MP floating pointer
 * structure.  A non-zero value selects one of the default configurations
 * (see mptable_pass1()/mptable_pass2()).  When TEST_DEFAULT_CONFIG is
 * defined the value is forced instead of being read from the table.
 */
#if defined(TEST_DEFAULT_CONFIG)
#define MPFPS_MPFB1	TEST_DEFAULT_CONFIG
#else
#define MPFPS_MPFB1	mpfps->mpfb1
#endif  /* TEST_DEFAULT_CONFIG */

/*
 * NOTE(review): presumably the BIOS warm-boot vector (offset/segment),
 * addressed through the KERNBASE mapping; the users of these macros are
 * outside this part of the file -- confirm.
 */
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

/*
 * Physical range scanned by i386_mp_probe() for the MP signature.
 * BIOS_COUNT is that range expressed in 32-bit words, which is the unit
 * search_for_sig() expects for its 'count' argument.
 */
#ifdef PC98
#define BIOS_BASE		(0xe8000)
#define BIOS_SIZE		(0x18000)
#else
#define BIOS_BASE		(0xf0000)
#define BIOS_SIZE		(0x10000)
#endif
#define BIOS_COUNT		(BIOS_SIZE/4)

/*
 * CMOS index/data I/O ports, used by CHECK_READ()/CHECK_WRITE().
 * NOTE(review): BIOS_RESET/BIOS_WARM look like the CMOS shutdown-status
 * register and its warm-start value; they are not used in this part of
 * the file -- confirm.
 */
#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* bits tested in PROCENTRY.cpu_flags and IOAPICENTRY.apic_flags */
#define PROCENTRY_FLAG_EN	0x01
#define PROCENTRY_FLAG_BP	0x02
#define IOAPICENTRY_FLAG_EN	0x01
113
114
/*
 * MP Floating Pointer Structure.
 *
 * i386_mp_probe() points 'mpfps' directly at the location where the
 * signature was found, so this layout is an overlay on firmware memory.
 * NOTE(review): 'pap' is pointer-typed, so the overlay presumably relies
 * on 32-bit pointers -- confirm before reusing on another ABI.
 */
typedef struct MPFPS {
	char    signature[4];	/* matched as a 32-bit word against MP_SIG */
	void   *pap;		/* MP configuration table header (mpcth_t) */
	u_char  length;
	u_char  spec_rev;
	u_char  checksum;
	u_char  mpfb1;		/* non-zero: default configuration type */
	u_char  mpfb2;		/* bit 0x80 set: PIC mode, else virtual wire */
	u_char  mpfb3;
	u_char  mpfb4;
	u_char  mpfb5;
}      *mpfps_t;
128
/*
 * MP Configuration Table Header.
 *
 * Reached through MPFPS.pap.  The base table entries start immediately
 * after this header (the table walkers begin at cth + sizeof(struct MPCTH)).
 */
typedef struct MPCTH {
	char    signature[4];
	u_short base_table_length;	/* header + entries, in bytes */
	u_char  spec_rev;
	u_char  checksum;
	u_char  oem_id[8];
	u_char  product_id[12];
	void   *oem_table_pointer;
	u_short oem_table_size;
	u_short entry_count;		/* number of base table entries to walk */
	void   *apic_address;		/* recorded as cpu_apic_address */
	u_short extended_table_length;
	u_char  extended_table_checksum;
	u_char  reserved;
}      *mpcth_t;
145
146
/*
 * Base table entry type 0: processor.
 * basetable_entry_types[] gives this entry a length of 20 bytes.
 * NOTE(review): that matches this struct only where u_long is 32 bits.
 */
typedef struct PROCENTRY {
	u_char  type;
	u_char  apic_id;	/* local APIC ID of this processor */
	u_char  apic_version;
	u_char  cpu_flags;	/* PROCENTRY_FLAG_* bits */
	u_long  cpu_signature;
	u_long  feature_flags;
	u_long  reserved1;
	u_long  reserved2;
}      *proc_entry_ptr;
157
/* Base table entry type 1: bus (8 bytes per basetable_entry_types[]). */
typedef struct BUSENTRY {
	u_char  type;
	u_char  bus_id;
	char    bus_type[6];	/* bus type name; not NUL-terminated space is implied by the fixed width -- TODO confirm padding */
}      *bus_entry_ptr;
163
/* Base table entry type 2: I/O APIC (8 bytes per basetable_entry_types[]). */
typedef struct IOAPICENTRY {
	u_char  type;
	u_char  apic_id;
	u_char  apic_version;
	u_char  apic_flags;	/* IOAPICENTRY_FLAG_EN marks a usable APIC */
	void   *apic_address;	/* stored into io_apic_address[] by pass 1 */
}      *io_apic_entry_ptr;
171
/*
 * Base table interrupt entry (8 bytes per basetable_entry_types[]).
 * Type 3 entries are counted in 'nintrs' and handed to int_entry();
 * type 4 entries are skipped by both table passes.
 */
typedef struct INTENTRY {
	u_char  type;
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
}      *int_entry_ptr;
181
/*
 * Description of one MP base table entry type.  The table walkers use
 * 'length' to step from one entry to the next.
 */
typedef struct BASETABLE_ENTRY {
	u_char  type;		/* entry type code (first byte of an entry) */
	u_char  length;		/* size of such an entry, in bytes */
	char    name[16];	/* human readable name */
}       basetable_entry;
188
189/*
190 * this code MUST be enabled here and in mpboot.s.
191 * it follows the very early stages of AP boot by placing values in CMOS ram.
192 * it NORMALLY will never be needed and thus the primitive method for enabling.
193 *
194#define CHECK_POINTS
195 */
196
#if defined(CHECK_POINTS) && !defined(PC98)
/*
 * CMOS checkpoint helpers.  A register is selected through the index port
 * (CMOS_REG) and then read or written through the data port (CMOS_DATA).
 */
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

/*
 * Store D into each of the six checkpoint registers 0x34..0x39.
 * Wrapped in do/while(0) so that "CHECK_INIT(x);" is exactly one
 * statement and is safe inside an unbraced if/else.
 */
#define CHECK_INIT(D)				\
	do {					\
		CHECK_WRITE(0x34, (D));		\
		CHECK_WRITE(0x35, (D));		\
		CHECK_WRITE(0x36, (D));		\
		CHECK_WRITE(0x37, (D));		\
		CHECK_WRITE(0x38, (D));		\
		CHECK_WRITE(0x39, (D));		\
	} while (0)

/* Print the six checkpoint registers, prefixed by the string S. */
#define CHECK_PRINT(S)				\
	do {					\
		printf("%s: %d, %d, %d, %d, %d, %d\n",	\
		   (S),				\
		   CHECK_READ(0x34),		\
		   CHECK_READ(0x35),		\
		   CHECK_READ(0x36),		\
		   CHECK_READ(0x37),		\
		   CHECK_READ(0x38),		\
		   CHECK_READ(0x39));		\
	} while (0)

#else				/* CHECK_POINTS */

/* checkpoints disabled: both macros expand to nothing */
#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */
225
/*
 * Values to send to the POST hardware.
 * Each value is emitted with POSTCODE() on entry to the routine named
 * next to it.
 */
#define MP_BOOTADDRESS_POST	0x10	/* mp_bootaddress() */
#define MP_PROBE_POST		0x11	/* i386_mp_probe() */
#define MPTABLE_PASS1_POST	0x12	/* mptable_pass1() */

#define MP_START_POST		0x13	/* cpu_mp_start() */
#define MP_ENABLE_POST		0x14	/* mp_enable() */
#define MPTABLE_PASS2_POST	0x15	/* mptable_pass2() */

/* NOTE(review): the users of the next three are outside this view. */
#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

#define MP_ANNOUNCE_POST	0x19	/* cpu_mp_announce() */
242
/*
 * Hyperthreading state.
 *  need_hyperthreading_fixup: set to 1 by mptable_hyperthread_fixup() when
 *	the MP table lists no sibling APIC IDs, so that mptable_pass2()
 *	synthesizes processor entries for them.
 *  logical_cpus: logical CPUs per package, taken from cpu_procinfo.
 *  logical_cpus_mask: bit per logical CPU number, filled by mptable_pass2().
 */
static int need_hyperthreading_fixup;
static u_int logical_cpus;
static u_int logical_cpus_mask;
246
247/* used to hold the AP's until we are ready to release them */
248static struct mtx ap_boot_mtx;
249
250/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
251int	current_postcode;
252
253/** XXX FIXME: what system files declare these??? */
254extern struct region_descriptor r_gdt, r_idt;
255
/*
 * Results of the MP table scan.  The counters and the address tables are
 * filled in by mptable_pass1(); boot_cpu_id is reset to -1 at the start of
 * mptable_pass2(), which panics if it is still -1 after the table walk.
 */
int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
int	mp_naps;		/* # of Application Processors (BSP excluded) */
int	mp_nbusses;		/* # of busses */
int	mp_napics;		/* # of IO APICs */
int	boot_cpu_id;		/* designated BSP */
vm_offset_t cpu_apic_address;	/* local APIC address, common to all CPUs */
vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
extern	int nkpt;

u_int32_t cpu_apic_versions[MAXCPU];
u_int32_t *io_apic_versions;	/* allocated in mptable_pass2(), mp_napics entries */
267
268#ifdef APIC_INTR_REORDER
269struct {
270	volatile int *location;
271	int bit;
272} apic_isrbit_location[32];
273#endif
274
275struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];
276
277/*
278 * APIC ID logical/physical mapping structures.
279 * We oversize these to simplify boot-time config.
280 */
281int     cpu_num_to_apic_id[NAPICID];
282int     io_num_to_apic_id[NAPICID];
283int     apic_id_to_logical[NAPICID];
284
285/*
286 * CPU topology map datastructures for HTT.
287 */
288struct	cpu_group	mp_groups[NAPICID];
289struct	cpu_top mp_top;
290struct	cpu_top *smp_topology;
291
292
293/* AP uses this during bootstrap.  Do not staticize.  */
294char *bootSTK;
295static int bootAP;
296
297/* Hotwire a 0->4MB V==P mapping */
298extern pt_entry_t *KPTphys;
299
300/* SMP page table page */
301extern pt_entry_t *SMPpt;
302
303struct pcb stoppcbs[MAXCPU];
304
305#ifdef APIC_IO
306/* Variables needed for SMP tlb shootdown. */
307vm_offset_t smp_tlb_addr1;
308vm_offset_t smp_tlb_addr2;
309volatile int smp_tlb_wait;
310static struct mtx smp_tlb_mtx;
311#endif
312
313/*
314 * Local data and functions.
315 */
316
317/* Set to 1 once we're ready to let the APs out of the pen. */
318static volatile int aps_ready = 0;
319
320static int	mp_capable;
321static u_int	boot_address;
322static u_int	base_memory;
323
324static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
325static mpfps_t	mpfps;
326static int	search_for_sig(u_int32_t target, int count);
327static void	mp_enable(u_int boot_addr);
328
329static void	mptable_hyperthread_fixup(u_int id_mask);
330static void	mptable_pass1(void);
331static int	mptable_pass2(void);
332static void	default_mp_table(int type);
333static void	fix_mp_table(void);
334static void	setup_apic_irq_mapping(void);
335static void	init_locks(void);
336static int	start_all_aps(u_int boot_addr);
337static void	install_ap_tramp(u_int boot_addr);
338static int	start_ap(int logicalCpu, u_int boot_addr);
339void		ap_init(void);
340static int	apic_int_is_bus_type(int intr, int bus_type);
341static void	release_aps(void *dummy);
342
343static int	hlt_cpus_mask;
344static int	hlt_logical_cpus = 1;
345static struct	sysctl_ctx_list logical_cpu_clist;
346
347/*
348 * initialize all the SMP locks
349 */
350
351/* lock region used by kernel profiling */
352int	mcount_lock;
353
354#ifdef USE_COMLOCK
355/* locks com (tty) data/hardware accesses: a FASTINTR() */
356struct mtx		com_mtx;
357#endif /* USE_COMLOCK */
358
/*
 * Set up the spin mutexes owned by this file.  Which ones exist depends
 * on the kernel configuration (USE_COMLOCK, APIC_IO).
 */
static void
init_locks(void)
{
#if defined(USE_COMLOCK)
	/* serializes com (tty) data/hardware accesses */
	mtx_init(&com_mtx, "com", NULL, MTX_SPIN);
#endif	/* USE_COMLOCK */

#if defined(APIC_IO)
	/* protects the SMP TLB shootdown variables */
	mtx_init(&smp_tlb_mtx, "tlb", NULL, MTX_SPIN);
#endif	/* APIC_IO */
}
370
371/*
372 * Calculate usable address in base memory for AP trampoline code.
373 */
374u_int
375mp_bootaddress(u_int basemem)
376{
377	POSTCODE(MP_BOOTADDRESS_POST);
378
379	base_memory = basemem * 1024;	/* convert to bytes */
380
381	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
382	if ((base_memory - boot_address) < bootMP_size)
383		boot_address -= 4096;	/* not enough, lower by 4k */
384
385	return boot_address;
386}
387
388
389/*
390 * Look for an Intel MP spec table (ie, SMP capable hardware).
391 */
392void
393i386_mp_probe(void)
394{
395	int     x;
396	u_long  segment;
397	u_int32_t target;
398
399	POSTCODE(MP_PROBE_POST);
400
401	/* see if EBDA exists */
402	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
403		/* search first 1K of EBDA */
404		target = (u_int32_t) (segment << 4);
405		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
406			goto found;
407	} else {
408		/* last 1K of base memory, effective 'top of base' passed in */
409		target = (u_int32_t) (base_memory - 0x400);
410		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
411			goto found;
412	}
413
414	/* search the BIOS */
415	target = (u_int32_t) BIOS_BASE;
416	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
417		goto found;
418
419	/* nothing found */
420	mpfps = (mpfps_t)0;
421	mp_capable = 0;
422	return;
423
424found:
425	/* calculate needed resources */
426	mpfps = (mpfps_t)x;
427	mptable_pass1();
428
429	/* flag fact that we are running multiple processors */
430	mp_capable = 1;
431}
432
433int
434cpu_mp_probe(void)
435{
436	/*
437	 * Record BSP in CPU map
438	 * This is done here so that MBUF init code works correctly.
439	 */
440	all_cpus = 1;
441
442	return (mp_capable);
443}
444
445/*
446 * Initialize the SMP hardware and the APIC and start up the AP's.
447 */
448void
449cpu_mp_start(void)
450{
451	POSTCODE(MP_START_POST);
452
453	/* look for MP capable motherboard */
454	if (mp_capable)
455		mp_enable(boot_address);
456	else
457		panic("MP hardware not found!");
458
459	cpu_setregs();
460}
461
462
463/*
464 * Print various information about the SMP system hardware and setup.
465 */
466void
467cpu_mp_announce(void)
468{
469	int     x;
470
471	POSTCODE(MP_ANNOUNCE_POST);
472
473	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
474	printf(", version: 0x%08x", cpu_apic_versions[0]);
475	printf(", at 0x%08x\n", cpu_apic_address);
476	for (x = 1; x <= mp_naps; ++x) {
477		printf(" cpu%d (AP):  apic id: %2d", x, CPU_TO_ID(x));
478		printf(", version: 0x%08x", cpu_apic_versions[x]);
479		printf(", at 0x%08x\n", cpu_apic_address);
480	}
481
482#if defined(APIC_IO)
483	for (x = 0; x < mp_napics; ++x) {
484		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
485		printf(", version: 0x%08x", io_apic_versions[x]);
486		printf(", at 0x%08x\n", io_apic_address[x]);
487	}
488#else
489	printf(" Warning: APIC I/O disabled\n");
490#endif	/* APIC_IO */
491}
492
493/*
494 * AP cpu's call this to sync up protected mode.
495 */
496void
497init_secondary(void)
498{
499	int	gsel_tss;
500	int	x, myid = bootAP;
501	u_int	cr0;
502
503	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
504	gdt_segs[GPROC0_SEL].ssd_base =
505		(int) &SMP_prvspace[myid].pcpu.pc_common_tss;
506	SMP_prvspace[myid].pcpu.pc_prvspace =
507		&SMP_prvspace[myid].pcpu;
508
509	for (x = 0; x < NGDT; x++) {
510		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
511	}
512
513	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
514	r_gdt.rd_base = (int) &gdt[myid * NGDT];
515	lgdt(&r_gdt);			/* does magic intra-segment return */
516
517	lidt(&r_idt);
518
519	lldt(_default_ldt);
520	PCPU_SET(currentldt, _default_ldt);
521
522	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
523	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
524	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
525	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
526	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
527	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
528	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
529	ltr(gsel_tss);
530
531	/*
532	 * Set to a known state:
533	 * Set by mpboot.s: CR0_PG, CR0_PE
534	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
535	 */
536	cr0 = rcr0();
537	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
538	load_cr0(cr0);
539}
540
541
542#if defined(APIC_IO)
543/*
544 * Final configuration of the BSP's local APIC:
545 *  - disable 'pic mode'.
546 *  - disable 'virtual wire mode'.
547 *  - enable NMI.
548 */
549void
550bsp_apic_configure(void)
551{
552	u_char		byte;
553	u_int32_t	temp;
554
555	/* leave 'pic mode' if necessary */
556	if (picmode) {
557		outb(0x22, 0x70);	/* select IMCR */
558		byte = inb(0x23);	/* current contents */
559		byte |= 0x01;		/* mask external INTR */
560		outb(0x23, byte);	/* disconnect 8259s/NMI */
561	}
562
563	/* mask lint0 (the 8259 'virtual wire' connection) */
564	temp = lapic.lvt_lint0;
565	temp |= APIC_LVT_M;		/* set the mask */
566	lapic.lvt_lint0 = temp;
567
568        /* setup lint1 to handle NMI */
569        temp = lapic.lvt_lint1;
570        temp &= ~APIC_LVT_M;		/* clear the mask */
571        lapic.lvt_lint1 = temp;
572
573	if (bootverbose)
574		apic_dump("bsp_apic_configure()");
575}
576#endif  /* APIC_IO */
577
578
579/*******************************************************************
580 * local functions and data
581 */
582
583/*
584 * start the SMP system
585 */
586static void
587mp_enable(u_int boot_addr)
588{
589	int     x;
590#if defined(APIC_IO)
591	int     apic;
592	u_int   ux;
593#endif	/* APIC_IO */
594
595	POSTCODE(MP_ENABLE_POST);
596
597	/* turn on 4MB of V == P addressing so we can get to MP table */
598	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
599	invltlb();
600
601	/* examine the MP table for needed info, uses physical addresses */
602	x = mptable_pass2();
603
604	*(int *)PTD = 0;
605	invltlb();
606
607	/* can't process default configs till the CPU APIC is pmapped */
608	if (x)
609		default_mp_table(x);
610
611	/* post scan cleanup */
612	fix_mp_table();
613	setup_apic_irq_mapping();
614
615#if defined(APIC_IO)
616
617	/* fill the LOGICAL io_apic_versions table */
618	for (apic = 0; apic < mp_napics; ++apic) {
619		ux = io_apic_read(apic, IOAPIC_VER);
620		io_apic_versions[apic] = ux;
621		io_apic_set_id(apic, IO_TO_ID(apic));
622	}
623
624	/* program each IO APIC in the system */
625	for (apic = 0; apic < mp_napics; ++apic)
626		if (io_apic_setup(apic) < 0)
627			panic("IO APIC setup failure");
628
629	/* install a 'Spurious INTerrupt' vector */
630	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
631	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
632
633	/* install an inter-CPU IPI for TLB invalidation */
634	setidt(XINVLTLB_OFFSET, Xinvltlb,
635	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
636	setidt(XINVLPG_OFFSET, Xinvlpg,
637	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
638	setidt(XINVLRNG_OFFSET, Xinvlrng,
639	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
640
641	/* install an inter-CPU IPI for forwarding hardclock() */
642	setidt(XHARDCLOCK_OFFSET, Xhardclock,
643	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
644
645	/* install an inter-CPU IPI for forwarding statclock() */
646	setidt(XSTATCLOCK_OFFSET, Xstatclock,
647	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
648
649	/* install an inter-CPU IPI for lazy pmap release */
650	setidt(XLAZYPMAP_OFFSET, Xlazypmap,
651	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
652
653	/* install an inter-CPU IPI for all-CPU rendezvous */
654	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
655	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
656
657	/* install an inter-CPU IPI for forcing an additional software trap */
658	setidt(XCPUAST_OFFSET, Xcpuast,
659	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
660
661	/* install an inter-CPU IPI for CPU stop/restart */
662	setidt(XCPUSTOP_OFFSET, Xcpustop,
663	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
664
665#if defined(TEST_TEST1)
666	/* install a "fake hardware INTerrupt" vector */
667	setidt(XTEST1_OFFSET, Xtest1,
668	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
669#endif  /** TEST_TEST1 */
670
671#endif	/* APIC_IO */
672
673	/* initialize all SMP locks */
674	init_locks();
675
676	/* start each Application Processor */
677	start_all_aps(boot_addr);
678}
679
680
681/*
682 * look for the MP spec signature
683 */
684
685/* string defined by the Intel MP Spec as identifying the MP table */
686#define MP_SIG		0x5f504d5f	/* _MP_ */
687#define NEXT(X)		((X) += 4)
688static int
689search_for_sig(u_int32_t target, int count)
690{
691	int     x;
692	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
693
694	for (x = 0; x < count; NEXT(x))
695		if (addr[x] == MP_SIG)
696			/* make array index a byte index */
697			return (target + (x * sizeof(u_int32_t)));
698
699	return -1;
700}
701
702
/*
 * Base table entry descriptions, indexed by the entry's type byte.
 * mptable_pass1() and mptable_pass2() use the length column to advance
 * to the next entry.
 */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};
711
/* One bus known to the kernel; bus_id 0xff marks an unused slot. */
typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

/*
 * Kernel copy of an interrupt entry.  The first fields mirror INTENTRY;
 * int_vector is the assigned irq, with 0xff meaning "none assigned".
 */
typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
	u_char	int_vector;
}       io_int, local_int;

/* Pairs a bus type code with its printable name. */
typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];
}       bus_type_name;
731
/*
 * Bus type codes and their names; "---" marks an unknown bus type.
 * NOTE(review): MCA appears twice (4th and 7th row) and how the table is
 * indexed is decided by code outside this view -- confirm before
 * reordering or removing rows.
 */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{MCA, "MCA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/*
 * Bus layout of the seven default configurations, from MP spec v1.4,
 * table 5-1.  A row is selected with (default configuration type - 1);
 * mptable_pass1() reads column 0 to get the bus count.  255 fills the
 * columns of a second bus that does not exist.
 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, MCA, 255, 255},
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{2, 0, MCA, 1, PCI}
};
766
767
768/* the bus data */
769static bus_datum *bus_data;
770
771/* the IO INT data, one entry per possible APIC INTerrupt */
772static io_int  *io_apic_ints;
773
774static int nintrs;
775
776static int processor_entry(proc_entry_ptr entry, int cpu);
777static int bus_entry(bus_entry_ptr entry, int bus);
778static int io_apic_entry(io_apic_entry_ptr entry, int apic);
779static int int_entry(int_entry_ptr entry, int intr);
780static int lookup_bus_type(char *name);
781
782
783/*
784 * 1st pass on motherboard's Intel MP specification table.
785 *
786 * initializes:
787 *	mp_ncpus = 1
788 *
789 * determines:
790 *	cpu_apic_address (common to all CPUs)
791 *	io_apic_address[N]
792 *	mp_naps
793 *	mp_nbusses
794 *	mp_napics
795 *	nintrs
796 */
797static void
798mptable_pass1(void)
799{
800	int	x;
801	mpcth_t	cth;
802	int	totalSize;
803	void*	position;
804	int	count;
805	int	type;
806	u_int	id_mask;
807
808	POSTCODE(MPTABLE_PASS1_POST);
809
810	/* clear various tables */
811	for (x = 0; x < NAPICID; ++x) {
812		io_apic_address[x] = ~0;	/* IO APIC address table */
813	}
814
815	/* init everything to empty */
816	mp_naps = 0;
817	mp_nbusses = 0;
818	mp_napics = 0;
819	nintrs = 0;
820	id_mask = 0;
821
822	/* check for use of 'default' configuration */
823	if (MPFPS_MPFB1 != 0) {
824		/* use default addresses */
825		cpu_apic_address = DEFAULT_APIC_BASE;
826		io_apic_address[0] = DEFAULT_IO_APIC_BASE;
827
828		/* fill in with defaults */
829		mp_naps = 2;		/* includes BSP */
830		mp_maxid = 1;
831		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
832#if defined(APIC_IO)
833		mp_napics = 1;
834		nintrs = 16;
835#endif	/* APIC_IO */
836	}
837	else {
838		if ((cth = mpfps->pap) == 0)
839			panic("MP Configuration Table Header MISSING!");
840
841		cpu_apic_address = (vm_offset_t) cth->apic_address;
842
843		/* walk the table, recording info of interest */
844		totalSize = cth->base_table_length - sizeof(struct MPCTH);
845		position = (u_char *) cth + sizeof(struct MPCTH);
846		count = cth->entry_count;
847
848		while (count--) {
849			switch (type = *(u_char *) position) {
850			case 0: /* processor_entry */
851				if (((proc_entry_ptr)position)->cpu_flags
852				    & PROCENTRY_FLAG_EN) {
853					++mp_naps;
854					mp_maxid++;
855					id_mask |= 1 <<
856					    ((proc_entry_ptr)position)->apic_id;
857				}
858				break;
859			case 1: /* bus_entry */
860				++mp_nbusses;
861				break;
862			case 2: /* io_apic_entry */
863				if (((io_apic_entry_ptr)position)->apic_flags
864					& IOAPICENTRY_FLAG_EN)
865					io_apic_address[mp_napics++] =
866					    (vm_offset_t)((io_apic_entry_ptr)
867						position)->apic_address;
868				break;
869			case 3: /* int_entry */
870				++nintrs;
871				break;
872			case 4:	/* int_entry */
873				break;
874			default:
875				panic("mpfps Base Table HOSED!");
876				/* NOTREACHED */
877			}
878
879			totalSize -= basetable_entry_types[type].length;
880			(u_char*)position += basetable_entry_types[type].length;
881		}
882	}
883
884	/* qualify the numbers */
885	if (mp_naps > MAXCPU) {
886		printf("Warning: only using %d of %d available CPUs!\n",
887			MAXCPU, mp_naps);
888		mp_naps = MAXCPU;
889	}
890
891	/* See if we need to fixup HT logical CPUs. */
892	mptable_hyperthread_fixup(id_mask);
893
894	/*
895	 * Count the BSP.
896	 * This is also used as a counter while starting the APs.
897	 */
898	mp_ncpus = 1;
899
900	--mp_naps;	/* subtract the BSP */
901}
902
903
904/*
905 * 2nd pass on motherboard's Intel MP specification table.
906 *
907 * sets:
908 *	boot_cpu_id
909 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
910 *	CPU_TO_ID(N), logical CPU to APIC ID table
911 *	IO_TO_ID(N), logical IO to APIC ID table
912 *	bus_data[N]
913 *	io_apic_ints[N]
914 */
915static int
916mptable_pass2(void)
917{
918	struct PROCENTRY proc;
919	int     x;
920	mpcth_t cth;
921	int     totalSize;
922	void*   position;
923	int     count;
924	int     type;
925	int     apic, bus, cpu, intr;
926	int	i, j;
927
928	POSTCODE(MPTABLE_PASS2_POST);
929
930	/* Initialize fake proc entry for use with HT fixup. */
931	bzero(&proc, sizeof(proc));
932	proc.type = 0;
933	proc.cpu_flags = PROCENTRY_FLAG_EN;
934
935	MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics,
936	    M_DEVBUF, M_WAITOK);
937	MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics,
938	    M_DEVBUF, M_WAITOK);
939	MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1),
940	    M_DEVBUF, M_WAITOK);
941	MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses,
942	    M_DEVBUF, M_WAITOK);
943
944	bzero(ioapic, sizeof(ioapic_t *) * mp_napics);
945
946	for (i = 0; i < mp_napics; i++) {
947		for (j = 0; j < mp_napics; j++) {
948			/* same page frame as a previous IO apic? */
949			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) ==
950			    (io_apic_address[i] & PG_FRAME)) {
951				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
952					+ (NPTEPG-2-j) * PAGE_SIZE
953					+ (io_apic_address[i] & PAGE_MASK));
954				break;
955			}
956			/* use this slot if available */
957			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) {
958				SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW |
959				    (io_apic_address[i] & PG_FRAME));
960				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
961					+ (NPTEPG-2-j) * PAGE_SIZE
962					+ (io_apic_address[i] & PAGE_MASK));
963				break;
964			}
965		}
966	}
967
968	/* clear various tables */
969	for (x = 0; x < NAPICID; ++x) {
970		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
971		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
972		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
973	}
974
975	/* clear bus data table */
976	for (x = 0; x < mp_nbusses; ++x)
977		bus_data[x].bus_id = 0xff;
978
979	/* clear IO APIC INT table */
980	for (x = 0; x < (nintrs + 1); ++x) {
981		io_apic_ints[x].int_type = 0xff;
982		io_apic_ints[x].int_vector = 0xff;
983	}
984
985	/* setup the cpu/apic mapping arrays */
986	boot_cpu_id = -1;
987
988	/* record whether PIC or virtual-wire mode */
989	picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
990
991	/* check for use of 'default' configuration */
992	if (MPFPS_MPFB1 != 0)
993		return MPFPS_MPFB1;	/* return default configuration type */
994
995	if ((cth = mpfps->pap) == 0)
996		panic("MP Configuration Table Header MISSING!");
997
998	/* walk the table, recording info of interest */
999	totalSize = cth->base_table_length - sizeof(struct MPCTH);
1000	position = (u_char *) cth + sizeof(struct MPCTH);
1001	count = cth->entry_count;
1002	apic = bus = intr = 0;
1003	cpu = 1;				/* pre-count the BSP */
1004
1005	while (count--) {
1006		switch (type = *(u_char *) position) {
1007		case 0:
1008			if (processor_entry(position, cpu)) {
1009				if (logical_cpus != 0 &&
1010				    cpu % logical_cpus != 0)
1011					logical_cpus_mask |= (1 << cpu);
1012				++cpu;
1013			}
1014			if (need_hyperthreading_fixup) {
1015				/*
1016				 * Create fake mptable processor entries
1017				 * and feed them to processor_entry() to
1018				 * enumerate the logical CPUs.
1019				 */
1020				proc.apic_id = ((proc_entry_ptr)position)->apic_id;
1021				for (i = 1; i < logical_cpus; i++) {
1022					proc.apic_id++;
1023					(void)processor_entry(&proc, cpu);
1024					logical_cpus_mask |= (1 << cpu);
1025					cpu++;
1026				}
1027			}
1028			break;
1029		case 1:
1030			if (bus_entry(position, bus))
1031				++bus;
1032			break;
1033		case 2:
1034			if (io_apic_entry(position, apic))
1035				++apic;
1036			break;
1037		case 3:
1038			if (int_entry(position, intr))
1039				++intr;
1040			break;
1041		case 4:
1042			/* int_entry(position); */
1043			break;
1044		default:
1045			panic("mpfps Base Table HOSED!");
1046			/* NOTREACHED */
1047		}
1048
1049		totalSize -= basetable_entry_types[type].length;
1050		(u_char *) position += basetable_entry_types[type].length;
1051	}
1052
1053	if (boot_cpu_id == -1)
1054		panic("NO BSP found!");
1055
1056	/* report fact that its NOT a default configuration */
1057	return 0;
1058}
1059
1060/*
1061 * Check if we should perform a hyperthreading "fix-up" to
1062 * enumerate any logical CPU's that aren't already listed
1063 * in the table.
1064 *
1065 * XXX: We assume that all of the physical CPUs in the
1066 * system have the same number of logical CPUs.
1067 *
1068 * XXX: We assume that APIC ID's are allocated such that
1069 * the APIC ID's for a physical processor are aligned
1070 * with the number of logical CPU's in the processor.
1071 */
1072static void
1073mptable_hyperthread_fixup(u_int id_mask)
1074{
1075	u_int i, id;
1076	int logical;
1077
1078	/* Nothing to do if there is no HTT support. */
1079	if ((cpu_feature & CPUID_HTT) == 0)
1080		return;
1081	logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
1082	if (logical_cpus <= 1)
1083		return;
1084
1085	/*
1086	 * For each APIC ID of a CPU that is set in the mask,
1087	 * scan the other candidate APIC ID's for this
1088	 * physical processor.  If any of those ID's are
1089	 * already in the table, then kill the fixup.
1090	 */
1091	for (id = 0; id <= MAXCPU; id++) {
1092		if ((id_mask & 1 << id) == 0)
1093			continue;
1094		/* First, make sure we are on a logical_cpus boundary. */
1095		if (id % logical_cpus != 0)
1096			return;
1097		for (i = id + 1; i < id + logical_cpus; i++)
1098			if ((id_mask & 1 << i) != 0)
1099				return;
1100	}
1101
1102	/*
1103	 * Ok, the ID's checked out, so enable the fixup.  We have to fixup
1104	 * mp_naps and mp_maxid right now.
1105	 */
1106	need_hyperthreading_fixup = 1;
1107	mp_maxid *= logical_cpus;
1108	mp_naps *= logical_cpus;
1109
1110	/*
1111	 * Now setup the cpu topology map.
1112	 */
1113	mp_top.ct_count = mp_naps / logical_cpus;
1114	mp_top.ct_group = mp_groups;
1115
1116	/*
1117	 * The first logical id is directly after the last valid physical id.
1118	 */
1119	logical = mp_top.ct_count + 1;
1120
1121	for (i = 0; i < mp_top.ct_count; i++) {
1122		int j;
1123
1124		mp_groups[i].cg_mask = (1 << i);
1125		for (j = 1; j < logical_cpus; j++)
1126			mp_groups[i].cg_mask |= (1 << logical++);
1127		mp_groups[i].cg_count = logical_cpus;
1128		mp_groups[i].cg_children = 0;
1129	}
1130
1131	smp_topology = &mp_top;
1132}
1133
1134void
1135assign_apic_irq(int apic, int intpin, int irq)
1136{
1137	int x;
1138
1139	if (int_to_apicintpin[irq].ioapic != -1)
1140		panic("assign_apic_irq: inconsistent table");
1141
1142	int_to_apicintpin[irq].ioapic = apic;
1143	int_to_apicintpin[irq].int_pin = intpin;
1144	int_to_apicintpin[irq].apic_address = ioapic[apic];
1145	int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin;
1146
1147	for (x = 0; x < nintrs; x++) {
1148		if ((io_apic_ints[x].int_type == 0 ||
1149		     io_apic_ints[x].int_type == 3) &&
1150		    io_apic_ints[x].int_vector == 0xff &&
1151		    io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) &&
1152		    io_apic_ints[x].dst_apic_int == intpin)
1153			io_apic_ints[x].int_vector = irq;
1154	}
1155}
1156
1157void
1158revoke_apic_irq(int irq)
1159{
1160	int x;
1161	int oldapic;
1162	int oldintpin;
1163
1164	if (int_to_apicintpin[irq].ioapic == -1)
1165		panic("revoke_apic_irq: inconsistent table");
1166
1167	oldapic = int_to_apicintpin[irq].ioapic;
1168	oldintpin = int_to_apicintpin[irq].int_pin;
1169
1170	int_to_apicintpin[irq].ioapic = -1;
1171	int_to_apicintpin[irq].int_pin = 0;
1172	int_to_apicintpin[irq].apic_address = NULL;
1173	int_to_apicintpin[irq].redirindex = 0;
1174
1175	for (x = 0; x < nintrs; x++) {
1176		if ((io_apic_ints[x].int_type == 0 ||
1177		     io_apic_ints[x].int_type == 3) &&
1178		    io_apic_ints[x].int_vector != 0xff &&
1179		    io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) &&
1180		    io_apic_ints[x].dst_apic_int == oldintpin)
1181			io_apic_ints[x].int_vector = 0xff;
1182	}
1183}
1184
1185
1186static void
1187allocate_apic_irq(int intr)
1188{
1189	int apic;
1190	int intpin;
1191	int irq;
1192
1193	if (io_apic_ints[intr].int_vector != 0xff)
1194		return;		/* Interrupt handler already assigned */
1195
1196	if (io_apic_ints[intr].int_type != 0 &&
1197	    (io_apic_ints[intr].int_type != 3 ||
1198	     (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) &&
1199	      io_apic_ints[intr].dst_apic_int == 0)))
1200		return;		/* Not INT or ExtInt on != (0, 0) */
1201
1202	irq = 0;
1203	while (irq < APIC_INTMAPSIZE &&
1204	       int_to_apicintpin[irq].ioapic != -1)
1205		irq++;
1206
1207	if (irq >= APIC_INTMAPSIZE)
1208		return;		/* No free interrupt handlers */
1209
1210	apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id);
1211	intpin = io_apic_ints[intr].dst_apic_int;
1212
1213	assign_apic_irq(apic, intpin, irq);
1214	io_apic_setup_intpin(apic, intpin);
1215}
1216
1217
1218static void
1219swap_apic_id(int apic, int oldid, int newid)
1220{
1221	int x;
1222	int oapic;
1223
1224
1225	if (oldid == newid)
1226		return;			/* Nothing to do */
1227
1228	printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n",
1229	       apic, oldid, newid);
1230
1231	/* Swap physical APIC IDs in interrupt entries */
1232	for (x = 0; x < nintrs; x++) {
1233		if (io_apic_ints[x].dst_apic_id == oldid)
1234			io_apic_ints[x].dst_apic_id = newid;
1235		else if (io_apic_ints[x].dst_apic_id == newid)
1236			io_apic_ints[x].dst_apic_id = oldid;
1237	}
1238
1239	/* Swap physical APIC IDs in IO_TO_ID mappings */
1240	for (oapic = 0; oapic < mp_napics; oapic++)
1241		if (IO_TO_ID(oapic) == newid)
1242			break;
1243
1244	if (oapic < mp_napics) {
1245		printf("Changing APIC ID for IO APIC #%d from "
1246		       "%d to %d in MP table\n",
1247		       oapic, newid, oldid);
1248		IO_TO_ID(oapic) = oldid;
1249	}
1250	IO_TO_ID(apic) = newid;
1251}
1252
1253
1254static void
1255fix_id_to_io_mapping(void)
1256{
1257	int x;
1258
1259	for (x = 0; x < NAPICID; x++)
1260		ID_TO_IO(x) = -1;
1261
1262	for (x = 0; x <= mp_naps; x++)
1263		if (CPU_TO_ID(x) < NAPICID)
1264			ID_TO_IO(CPU_TO_ID(x)) = x;
1265
1266	for (x = 0; x < mp_napics; x++)
1267		if (IO_TO_ID(x) < NAPICID)
1268			ID_TO_IO(IO_TO_ID(x)) = x;
1269}
1270
1271
1272static int
1273first_free_apic_id(void)
1274{
1275	int freeid, x;
1276
1277	for (freeid = 0; freeid < NAPICID; freeid++) {
1278		for (x = 0; x <= mp_naps; x++)
1279			if (CPU_TO_ID(x) == freeid)
1280				break;
1281		if (x <= mp_naps)
1282			continue;
1283		for (x = 0; x < mp_napics; x++)
1284			if (IO_TO_ID(x) == freeid)
1285				break;
1286		if (x < mp_napics)
1287			continue;
1288		return freeid;
1289	}
1290	return freeid;
1291}
1292
1293
1294static int
1295io_apic_id_acceptable(int apic, int id)
1296{
1297	int cpu;		/* Logical CPU number */
1298	int oapic;		/* Logical IO APIC number for other IO APIC */
1299
1300	if (id >= NAPICID)
1301		return 0;	/* Out of range */
1302
1303	for (cpu = 0; cpu <= mp_naps; cpu++)
1304		if (CPU_TO_ID(cpu) == id)
1305			return 0;	/* Conflict with CPU */
1306
1307	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
1308		if (IO_TO_ID(oapic) == id)
1309			return 0;	/* Conflict with other APIC */
1310
1311	return 1;		/* ID is acceptable for IO APIC */
1312}
1313
1314
/*
 * Repair known defects in the in-memory MP table data (bus_data[],
 * io_apic_ints[] and the IO_TO_ID() mapping):
 *
 *  1. if exactly one PCI bus exists and it is not bus 0, swap it with
 *     whichever bus has ID 0 and renumber the affected INT entries;
 *  2. give every IO APIC a physical APIC ID that conflicts with neither a
 *     CPU nor a previously handled IO APIC, then rebuild the ID -> logical
 *     number mapping;
 *  3. append an ExtInt entry for pin 0 of IO APIC 0 if the table has no
 *     entry at all for that pin.
 *
 * Panics when no usable physical APIC ID can be found for an IO APIC.
 */
static void
fix_mp_table(void)
{
	int	x;
	int	id;
	int	bus_0 = 0;	/* Stop GCC warning */
	int	bus_pci = 0;	/* Stop GCC warning */
	int	num_pci_bus;
	int	apic;		/* IO APIC unit number */
	int     freeid;		/* Free physical APIC ID */
	int	physid;		/* Current physical IO APIC ID */

	/*
	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
	 * did it wrong.  The MP spec says that when more than 1 PCI bus
	 * exists the BIOS must begin with bus entries for the PCI bus and use
	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
	 * exists the BIOS can choose to ignore this ordering, and indeed many
	 * MP motherboards do ignore it.  This causes a problem when the PCI
	 * sub-system makes requests of the MP sub-system based on PCI bus
	 * numbers.  So here we look for the situation and renumber the
	 * busses and associated INTs in an effort to "make it right".
	 */

	/* find bus 0, PCI bus, count the number of PCI busses */
	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
		if (bus_data[x].bus_id == 0) {
			bus_0 = x;
		}
		if (bus_data[x].bus_type == PCI) {
			++num_pci_bus;
			bus_pci = x;
		}
	}
	/*
	 * bus_0 == slot of bus with ID of 0 (stays 0 if no such bus exists)
	 * bus_pci == slot of last PCI bus encountered
	 */

	/* check the 1 PCI bus case for sanity */
	/* if it is number 0 all is well */
	if (num_pci_bus == 1 &&
	    bus_data[bus_pci].bus_id != 0) {

		/* mis-numbered, swap with whichever bus uses ID 0 */

		/* swap the bus entry types (the bus IDs stay in place) */
		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
		bus_data[bus_0].bus_type = PCI;

		/* swap each relevant INTerrupt entry */
		id = bus_data[bus_pci].bus_id;
		for (x = 0; x < nintrs; ++x) {
			if (io_apic_ints[x].src_bus_id == id) {
				io_apic_ints[x].src_bus_id = 0;
			}
			else if (io_apic_ints[x].src_bus_id == 0) {
				io_apic_ints[x].src_bus_id = id;
			}
		}
	}

	/* Assign IO APIC IDs.
	 *
	 * First try the existing ID. If a conflict is detected, try
	 * the ID in the MP table.  If a conflict is still detected, find
	 * a free id.
	 *
	 * We cannot use the ID_TO_IO table before all conflicts have been
	 * resolved and the table has been corrected.
	 */
	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */

		/* First try to use the value set by the BIOS */
		physid = io_apic_get_id(apic);
		if (io_apic_id_acceptable(apic, physid)) {
			/* make the MP table data agree with the hardware */
			if (IO_TO_ID(apic) != physid)
				swap_apic_id(apic, IO_TO_ID(apic), physid);
			continue;
		}

		/* Then check if the value in the MP table is acceptable */
		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
			continue;

		/* Last resort, find a free APIC ID and use it */
		freeid = first_free_apic_id();
		if (freeid >= NAPICID)
			panic("No free physical APIC IDs found");

		if (io_apic_id_acceptable(apic, freeid)) {
			swap_apic_id(apic, IO_TO_ID(apic), freeid);
			continue;
		}
		panic("Free physical APIC ID not usable");
	}
	/* all conflicts are resolved; ID_TO_IO may be rebuilt now */
	fix_id_to_io_mapping();

	/* detect and fix broken Compaq MP table */
	if (apic_int_type(0, 0) == -1) {
		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
		/*
		 * NOTE(review): this writes one entry past the current end
		 * of io_apic_ints[]; presumably the array is allocated with
		 * a spare slot for this case -- confirm at the allocation.
		 */
		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
		nintrs++;
	}
}
1427
1428
1429/* Assign low level interrupt handlers */
1430static void
1431setup_apic_irq_mapping(void)
1432{
1433	int	x;
1434	int	int_vector;
1435
1436	/* Clear array */
1437	for (x = 0; x < APIC_INTMAPSIZE; x++) {
1438		int_to_apicintpin[x].ioapic = -1;
1439		int_to_apicintpin[x].int_pin = 0;
1440		int_to_apicintpin[x].apic_address = NULL;
1441		int_to_apicintpin[x].redirindex = 0;
1442	}
1443
1444	/* First assign ISA/EISA interrupts */
1445	for (x = 0; x < nintrs; x++) {
1446		int_vector = io_apic_ints[x].src_bus_irq;
1447		if (int_vector < APIC_INTMAPSIZE &&
1448		    io_apic_ints[x].int_vector == 0xff &&
1449		    int_to_apicintpin[int_vector].ioapic == -1 &&
1450		    (apic_int_is_bus_type(x, ISA) ||
1451		     apic_int_is_bus_type(x, EISA)) &&
1452		    io_apic_ints[x].int_type == 0) {
1453			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
1454					io_apic_ints[x].dst_apic_int,
1455					int_vector);
1456		}
1457	}
1458
1459	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
1460	for (x = 0; x < nintrs; x++) {
1461		if (io_apic_ints[x].dst_apic_int == 0 &&
1462		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
1463		    io_apic_ints[x].int_vector == 0xff &&
1464		    int_to_apicintpin[0].ioapic == -1 &&
1465		    io_apic_ints[x].int_type == 3) {
1466			assign_apic_irq(0, 0, 0);
1467			break;
1468		}
1469	}
1470	/* PCI interrupt assignment is deferred */
1471}
1472
1473
1474static int
1475processor_entry(proc_entry_ptr entry, int cpu)
1476{
1477	/* check for usability */
1478	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
1479		return 0;
1480
1481	if(entry->apic_id >= NAPICID)
1482		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
1483	/* check for BSP flag */
1484	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
1485		boot_cpu_id = entry->apic_id;
1486		CPU_TO_ID(0) = entry->apic_id;
1487		ID_TO_CPU(entry->apic_id) = 0;
1488		return 0;	/* its already been counted */
1489	}
1490
1491	/* add another AP to list, if less than max number of CPUs */
1492	else if (cpu < MAXCPU) {
1493		CPU_TO_ID(cpu) = entry->apic_id;
1494		ID_TO_CPU(entry->apic_id) = cpu;
1495		return 1;
1496	}
1497
1498	return 0;
1499}
1500
1501
1502static int
1503bus_entry(bus_entry_ptr entry, int bus)
1504{
1505	int     x;
1506	char    c, name[8];
1507
1508	/* encode the name into an index */
1509	for (x = 0; x < 6; ++x) {
1510		if ((c = entry->bus_type[x]) == ' ')
1511			break;
1512		name[x] = c;
1513	}
1514	name[x] = '\0';
1515
1516	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
1517		panic("unknown bus type: '%s'", name);
1518
1519	bus_data[bus].bus_id = entry->bus_id;
1520	bus_data[bus].bus_type = x;
1521
1522	return 1;
1523}
1524
1525
1526static int
1527io_apic_entry(io_apic_entry_ptr entry, int apic)
1528{
1529	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
1530		return 0;
1531
1532	IO_TO_ID(apic) = entry->apic_id;
1533	if (entry->apic_id < NAPICID)
1534		ID_TO_IO(entry->apic_id) = apic;
1535
1536	return 1;
1537}
1538
1539
1540static int
1541lookup_bus_type(char *name)
1542{
1543	int     x;
1544
1545	for (x = 0; x < MAX_BUSTYPE; ++x)
1546		if (strcmp(bus_type_table[x].name, name) == 0)
1547			return bus_type_table[x].type;
1548
1549	return UNKNOWN_BUSTYPE;
1550}
1551
1552
1553static int
1554int_entry(int_entry_ptr entry, int intr)
1555{
1556	int apic;
1557
1558	io_apic_ints[intr].int_type = entry->int_type;
1559	io_apic_ints[intr].int_flags = entry->int_flags;
1560	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
1561	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
1562	if (entry->dst_apic_id == 255) {
1563		/* This signal goes to all IO APICS.  Select an IO APIC
1564		   with sufficient number of interrupt pins */
1565		for (apic = 0; apic < mp_napics; apic++)
1566			if (((io_apic_read(apic, IOAPIC_VER) &
1567			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
1568			    entry->dst_apic_int)
1569				break;
1570		if (apic < mp_napics)
1571			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
1572		else
1573			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1574	} else
1575		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1576	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
1577
1578	return 1;
1579}
1580
1581
1582static int
1583apic_int_is_bus_type(int intr, int bus_type)
1584{
1585	int     bus;
1586
1587	for (bus = 0; bus < mp_nbusses; ++bus)
1588		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
1589		    && ((int) bus_data[bus].bus_type == bus_type))
1590			return 1;
1591
1592	return 0;
1593}
1594
1595
1596/*
1597 * Given a traditional ISA INT mask, return an APIC mask.
1598 */
1599u_int
1600isa_apic_mask(u_int isa_mask)
1601{
1602	int isa_irq;
1603	int apic_pin;
1604
1605#if defined(SKIP_IRQ15_REDIRECT)
1606	if (isa_mask == (1 << 15)) {
1607		printf("skipping ISA IRQ15 redirect\n");
1608		return isa_mask;
1609	}
1610#endif  /* SKIP_IRQ15_REDIRECT */
1611
1612	isa_irq = ffs(isa_mask);		/* find its bit position */
1613	if (isa_irq == 0)			/* doesn't exist */
1614		return 0;
1615	--isa_irq;				/* make it zero based */
1616
1617	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
1618	if (apic_pin == -1)
1619		return 0;
1620
1621	return (1 << apic_pin);			/* convert pin# to a mask */
1622}
1623
1624
1625/*
1626 * Determine which APIC pin an ISA/EISA INT is attached to.
1627 */
1628#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
1629#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
1630#define INTIRQ(I)	(io_apic_ints[(I)].int_vector)
1631#define INTAPIC(I)	(ID_TO_IO(io_apic_ints[(I)].dst_apic_id))
1632
1633#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)
1634int
1635isa_apic_irq(int isa_irq)
1636{
1637	int     intr;
1638
1639	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
1640		if (INTTYPE(intr) == 0) {		/* standard INT */
1641			if (SRCBUSIRQ(intr) == isa_irq) {
1642				if (apic_int_is_bus_type(intr, ISA) ||
1643			            apic_int_is_bus_type(intr, EISA)) {
1644					if (INTIRQ(intr) == 0xff)
1645						return -1; /* unassigned */
1646					return INTIRQ(intr);	/* found */
1647				}
1648			}
1649		}
1650	}
1651	return -1;					/* NOT found */
1652}
1653
1654
1655/*
1656 * Determine which APIC pin a PCI INT is attached to.
1657 */
1658#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
1659#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
1660#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
1661int
1662pci_apic_irq(int pciBus, int pciDevice, int pciInt)
1663{
1664	int     intr;
1665
1666	--pciInt;					/* zero based */
1667
1668	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
1669		if ((INTTYPE(intr) == 0)		/* standard INT */
1670		    && (SRCBUSID(intr) == pciBus)
1671		    && (SRCBUSDEVICE(intr) == pciDevice)
1672		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
1673			if (apic_int_is_bus_type(intr, PCI)) {
1674				if (INTIRQ(intr) == 0xff)
1675					allocate_apic_irq(intr);
1676				if (INTIRQ(intr) == 0xff)
1677					return -1;	/* unassigned */
1678				return INTIRQ(intr);	/* exact match */
1679			}
1680
1681	return -1;					/* NOT found */
1682}
1683
1684int
1685next_apic_irq(int irq)
1686{
1687	int intr, ointr;
1688	int bus, bustype;
1689
1690	bus = 0;
1691	bustype = 0;
1692	for (intr = 0; intr < nintrs; intr++) {
1693		if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
1694			continue;
1695		bus = SRCBUSID(intr);
1696		bustype = apic_bus_type(bus);
1697		if (bustype != ISA &&
1698		    bustype != EISA &&
1699		    bustype != PCI)
1700			continue;
1701		break;
1702	}
1703	if (intr >= nintrs) {
1704		return -1;
1705	}
1706	for (ointr = intr + 1; ointr < nintrs; ointr++) {
1707		if (INTTYPE(ointr) != 0)
1708			continue;
1709		if (bus != SRCBUSID(ointr))
1710			continue;
1711		if (bustype == PCI) {
1712			if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
1713				continue;
1714			if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
1715				continue;
1716		}
1717		if (bustype == ISA || bustype == EISA) {
1718			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
1719				continue;
1720		}
1721		if (INTPIN(intr) == INTPIN(ointr))
1722			continue;
1723		break;
1724	}
1725	if (ointr >= nintrs) {
1726		return -1;
1727	}
1728	return INTIRQ(ointr);
1729}
1730#undef SRCBUSLINE
1731#undef SRCBUSDEVICE
1732#undef SRCBUSID
1733#undef SRCBUSIRQ
1734
1735#undef INTPIN
1736#undef INTIRQ
1737#undef INTAPIC
1738#undef INTTYPE
1739
1740
1741/*
1742 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
1743 *
1744 * XXX FIXME:
1745 *  Exactly what this means is unclear at this point.  It is a solution
1746 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
1747 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
1748 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
1749 *  option.
1750 */
1751int
1752undirect_isa_irq(int rirq)
1753{
1754#if defined(READY)
1755	if (bootverbose)
1756	    printf("Freeing redirected ISA irq %d.\n", rirq);
1757	/** FIXME: tickle the MB redirector chip */
1758	return -1;
1759#else
1760	if (bootverbose)
1761	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
1762	return 0;
1763#endif  /* READY */
1764}
1765
1766
1767/*
1768 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
1769 */
1770int
1771undirect_pci_irq(int rirq)
1772{
1773#if defined(READY)
1774	if (bootverbose)
1775		printf("Freeing redirected PCI irq %d.\n", rirq);
1776
1777	/** FIXME: tickle the MB redirector chip */
1778	return -1;
1779#else
1780	if (bootverbose)
1781		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
1782		       rirq);
1783	return 0;
1784#endif  /* READY */
1785}
1786
1787
1788/*
1789 * given a bus ID, return:
1790 *  the bus type if found
1791 *  -1 if NOT found
1792 */
1793int
1794apic_bus_type(int id)
1795{
1796	int     x;
1797
1798	for (x = 0; x < mp_nbusses; ++x)
1799		if (bus_data[x].bus_id == id)
1800			return bus_data[x].bus_type;
1801
1802	return -1;
1803}
1804
1805
1806/*
1807 * given a LOGICAL APIC# and pin#, return:
1808 *  the associated src bus ID if found
1809 *  -1 if NOT found
1810 */
1811int
1812apic_src_bus_id(int apic, int pin)
1813{
1814	int     x;
1815
1816	/* search each of the possible INTerrupt sources */
1817	for (x = 0; x < nintrs; ++x)
1818		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1819		    (pin == io_apic_ints[x].dst_apic_int))
1820			return (io_apic_ints[x].src_bus_id);
1821
1822	return -1;		/* NOT found */
1823}
1824
1825
1826/*
1827 * given a LOGICAL APIC# and pin#, return:
1828 *  the associated src bus IRQ if found
1829 *  -1 if NOT found
1830 */
1831int
1832apic_src_bus_irq(int apic, int pin)
1833{
1834	int     x;
1835
1836	for (x = 0; x < nintrs; x++)
1837		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1838		    (pin == io_apic_ints[x].dst_apic_int))
1839			return (io_apic_ints[x].src_bus_irq);
1840
1841	return -1;		/* NOT found */
1842}
1843
1844
1845/*
1846 * given a LOGICAL APIC# and pin#, return:
1847 *  the associated INTerrupt type if found
1848 *  -1 if NOT found
1849 */
1850int
1851apic_int_type(int apic, int pin)
1852{
1853	int     x;
1854
1855	/* search each of the possible INTerrupt sources */
1856	for (x = 0; x < nintrs; ++x)
1857		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1858		    (pin == io_apic_ints[x].dst_apic_int))
1859			return (io_apic_ints[x].int_type);
1860
1861	return -1;		/* NOT found */
1862}
1863
1864int
1865apic_irq(int apic, int pin)
1866{
1867	int x;
1868	int res;
1869
1870	for (x = 0; x < nintrs; ++x)
1871		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1872		    (pin == io_apic_ints[x].dst_apic_int)) {
1873			res = io_apic_ints[x].int_vector;
1874			if (res == 0xff)
1875				return -1;
1876			if (apic != int_to_apicintpin[res].ioapic)
1877				panic("apic_irq: inconsistent table");
1878			if (pin != int_to_apicintpin[res].int_pin)
1879				panic("apic_irq inconsistent table (2)");
1880			return res;
1881		}
1882	return -1;
1883}
1884
1885
1886/*
1887 * given a LOGICAL APIC# and pin#, return:
1888 *  the associated trigger mode if found
1889 *  -1 if NOT found
1890 */
1891int
1892apic_trigger(int apic, int pin)
1893{
1894	int     x;
1895
1896	/* search each of the possible INTerrupt sources */
1897	for (x = 0; x < nintrs; ++x)
1898		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1899		    (pin == io_apic_ints[x].dst_apic_int))
1900			return ((io_apic_ints[x].int_flags >> 2) & 0x03);
1901
1902	return -1;		/* NOT found */
1903}
1904
1905
1906/*
1907 * given a LOGICAL APIC# and pin#, return:
1908 *  the associated 'active' level if found
1909 *  -1 if NOT found
1910 */
1911int
1912apic_polarity(int apic, int pin)
1913{
1914	int     x;
1915
1916	/* search each of the possible INTerrupt sources */
1917	for (x = 0; x < nintrs; ++x)
1918		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1919		    (pin == io_apic_ints[x].dst_apic_int))
1920			return (io_apic_ints[x].int_flags & 0x03);
1921
1922	return -1;		/* NOT found */
1923}
1924
1925
/*
 * set data according to MP defaults
 * FIXME: probably not complete yet...
 *
 * 'type' is the MP default configuration type.  Types 1..7 select a row
 * of default_data[] for the two bus entries; any other value panics.
 * The function sets up the BSP and a single AP, and with APIC_IO also one
 * IO APIC and 16 interrupt entries mapped 1-to-1 to its pins.
 */
static void
default_mp_table(int type)
{
	int     ap_cpu_id;
#if defined(APIC_IO)
	int     io_apic_id;
	int     pin;
#endif	/* APIC_IO */

#if 0
	/*
	 * Disabled debugging output.
	 * NOTE(review): types 2 and 3 print the same text; confirm the
	 * type 3 description against the MP spec before enabling.
	 */
	printf("  MP default config type: %d\n", type);
	switch (type) {
	case 1:
		printf("   bus: ISA, APIC: 82489DX\n");
		break;
	case 2:
		printf("   bus: EISA, APIC: 82489DX\n");
		break;
	case 3:
		printf("   bus: EISA, APIC: 82489DX\n");
		break;
	case 4:
		printf("   bus: MCA, APIC: 82489DX\n");
		break;
	case 5:
		printf("   bus: ISA+PCI, APIC: Integrated\n");
		break;
	case 6:
		printf("   bus: EISA+PCI, APIC: Integrated\n");
		break;
	case 7:
		printf("   bus: MCA+PCI, APIC: Integrated\n");
		break;
	default:
		printf("   future type\n");
		break;
		/* NOTREACHED */
	}
#endif	/* 0 */

	/* the BSP's ID comes from its local APIC; the AP gets 0 or 1 */
	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;

	/* BSP */
	CPU_TO_ID(0) = boot_cpu_id;
	ID_TO_CPU(boot_cpu_id) = 0;

	/* one and only AP */
	CPU_TO_ID(1) = ap_cpu_id;
	ID_TO_CPU(ap_cpu_id) = 1;

#if defined(APIC_IO)
	/* one and only IO APIC */
	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;

	/*
	 * sanity check, refer to MP spec section 3.6.6, last paragraph
	 * necessary as some hardware isn't properly setting up the IO APIC
	 *
	 * Whichever condition is compiled in, a rejected ID is replaced
	 * by 2 both in the hardware and in io_apic_id.
	 */
#if defined(REALLY_ANAL_IOAPICID_VALUE)
	if (io_apic_id != 2) {
#else
	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
#endif	/* REALLY_ANAL_IOAPICID_VALUE */
		io_apic_set_id(0, 2);
		io_apic_id = 2;
	}
	IO_TO_ID(0) = io_apic_id;
	ID_TO_IO(io_apic_id) = 0;
#endif	/* APIC_IO */

	/* fill out bus entries */
	switch (type) {
	case 1:
	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
	case 7:
		/* columns 1..4 of the row hold id/type of bus 0 and bus 1 */
		bus_data[0].bus_id = default_data[type - 1][1];
		bus_data[0].bus_type = default_data[type - 1][2];
		bus_data[1].bus_id = default_data[type - 1][3];
		bus_data[1].bus_type = default_data[type - 1][4];
		break;

	/*
	 * case 4: case 7:		   MCA NOT supported
	 * NOTE(review): types 4 and 7 are nevertheless accepted by the
	 * case list above.
	 */
	default:		/* illegal/reserved */
		panic("BAD default MP config: %d", type);
		/* NOTREACHED */
	}

#if defined(APIC_IO)
	/* general cases from MP v1.4, table 5-2 */
	for (pin = 0; pin < 16; ++pin) {
		io_apic_ints[pin].int_type = 0;
		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
		io_apic_ints[pin].src_bus_id = 0;
		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
		io_apic_ints[pin].dst_apic_id = io_apic_id;
		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
	}

	/* special cases from MP v1.4, table 5-2 */
	if (type == 2) {
		io_apic_ints[2].int_type = 0xff;	/* N/C */
		io_apic_ints[13].int_type = 0xff;	/* N/C */
#if !defined(APIC_MIXED_MODE)
		/** FIXME: ??? */
		panic("sorry, can't support type 2 default yet");
#endif	/* APIC_MIXED_MODE */
	}
	else
		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */

	/* pin 0: not connected for type 7, the 8259 (ExtInt) otherwise */
	if (type == 7)
		io_apic_ints[0].int_type = 0xff;	/* N/C */
	else
		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
#endif	/* APIC_IO */
}
2051
2052
/*
 * start each AP in our list
 *
 * 'boot_addr' is the address the AP boot code is installed at by
 * install_ap_tramp(); it is also used to build the warm-start vector and
 * is handed to start_ap().
 *
 * For every AP (logical CPUs 1..mp_naps) a private data page and an idle
 * stack are allocated and entered into SMPpt[], then the AP is started.
 * If an AP fails to start, the operator is asked whether to panic.
 *
 * Returns mp_ncpus - 1.
 */
static int
start_all_aps(u_int boot_addr)
{
	int     x, i, pg;
#ifndef PC98
	u_char  mpbiosreason;
#endif
	u_long  mpbioswarmvec;
	struct pcpu *pc;
	char *stack;
	uintptr_t kptbase;

	POSTCODE(START_ALL_APS_POST);

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* initialize BSP's local APIC */
	apic_initialize();
	bsp_apic_ready = 1;

	/* install the AP 1st level boot code */
	install_ap_tramp(boot_addr);


	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
#ifndef PC98
	/* ... and the CMOS reset reason, both are restored further down */
	outb(CMOS_REG, BIOS_RESET);
	mpbiosreason = inb(CMOS_DATA);
#endif

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
	kptbase = (uintptr_t)(void *)KPTphys;
	for (x = 0; x < NKPT; x++)
		PTD[x] = (pd_entry_t)(PG_V | PG_RW |
		    ((kptbase + x * PAGE_SIZE) & PG_FRAME));
	invltlb();

	/* start each AP */
	for (x = 1; x <= mp_naps; ++x) {

		/* This is a bit verbose, it will go away soon.  */

		/* first page of AP's private space */
		pg = x * i386_btop(sizeof(struct privatespace));

		/* allocate a new private data page */
		/* NOTE(review): the kmem_alloc() results are not checked */
		pc = (struct pcpu *)kmem_alloc(kernel_map, PAGE_SIZE);

		/* wire it into the private page table page */
		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(pc));

		/* allocate and set up an idle stack data page */
		stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); /* XXXKSE */
		for (i = 0; i < KSTACK_PAGES; i++)
			SMPpt[pg + 1 + i] = (pt_entry_t)
			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

		/* prime data page for it to use */
		pcpu_init(pc, x, sizeof(struct pcpu));

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
#ifndef PC98
		outb(CMOS_REG, BIOS_RESET);
		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
#endif

		/* stack top and logical CPU number for the AP being booted */
		bootSTK = &SMP_prvspace[x].idlekstack[KSTACK_PAGES * PAGE_SIZE];
		bootAP = x;

		/* attempt to start the Application Processor */
		CHECK_INIT(99);	/* setup checkpoints */
		if (!start_ap(x, boot_addr)) {
			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
			CHECK_PRINT("trace");	/* show checkpoints */
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}
		CHECK_PRINT("trace");		/* show checkpoints */

		/*
		 * record its version info
		 * NOTE(review): entry 0 is copied here but only filled in
		 * from lapic.version after this loop; confirm that it was
		 * initialized elsewhere before this point.
		 */
		cpu_apic_versions[x] = cpu_apic_versions[0];

		all_cpus |= (1 << x);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	/* fill in our (BSP) APIC version */
	cpu_apic_versions[0] = lapic.version;

	/* restore the warmstart vector */
	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
#ifndef PC98
	outb(CMOS_REG, BIOS_RESET);
	outb(CMOS_DATA, mpbiosreason);
#endif

	/*
	 * Set up the idle context for the BSP.  Similar to above except
	 * that some was done by locore, some by pmap.c and some is implicit
	 * because the BSP is cpu#0 and the page is initially zero, and also
	 * because we can refer to variables by name on the BSP..
	 */

	/* Allocate and setup BSP idle stack */
	stack = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE);
	for (i = 0; i < KSTACK_PAGES; i++)
		SMPpt[1 + i] = (pt_entry_t)
		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));

	/* tear down the temporary P==V mapping set up above */
	for (x = 0; x < NKPT; x++)
		PTD[x] = 0;

	/*
	 * number of APs actually started
	 * (mp_ncpus is not modified in this function; start_ap() only
	 * watches it grow)
	 */
	return mp_ncpus - 1;
}
2179
2180
/*
 * load the 1st level AP boot code into base memory.
 *
 * The bootMP image (bootMP_size bytes) is copied to boot_addr + KERNBASE,
 * then four locations inside the copy are patched so that they refer to
 * the place the code now lives at.
 */

/* targets for relocation */
extern void bigJump(void);
extern void bootCodeSeg(void);
extern void bootDataSeg(void);
extern void MPentry(void);
extern u_int MP_GDT;
extern u_int mp_gdtbase;

static void
install_ap_tramp(u_int boot_addr)
{
	int     x;
	int     size = *(int *) ((u_long) & bootMP_size);
	u_char *src = (u_char *) ((u_long) bootMP);
	u_char *dst = (u_char *) boot_addr + KERNBASE;
	u_int   boot_base = (u_int) bootMP;	/* link-time address of the image */
	u_int8_t *dst8;
	u_int16_t *dst16;
	u_int32_t *dst32;

	POSTCODE(INSTALL_AP_TRAMP_POST);

	/* byte-wise copy of the boot code image */
	for (x = 0; x < size; ++x)
		*dst++ = *src++;

	/*
	 * modify addresses in code we just moved to basemem. unfortunately we
	 * need fairly detailed info about mpboot.s for this to work.  changes
	 * to mpboot.s might require changes here.
	 *
	 * Each patch location is found as (symbol - boot_base), i.e. the
	 * symbol's offset inside the image, added to the start of the copy.
	 */

	/* boot code is located in KERNEL space */
	dst = (u_char *) boot_addr + KERNBASE;

	/* modify the lgdt arg */
	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);

	/* modify the ljmp target for MPentry() */
	/* (the 32 bit operand starts one byte past the bigJump label) */
	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
	*dst32 = ((u_int) MPentry - KERNBASE);

	/* modify the target for boot code segment */
	/* low 16 bits of boot_addr, followed by bits 16-23 in the next byte */
	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;

	/* modify the target for boot data segment */
	/* same 16 + 8 bit layout as for the code segment */
	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
	dst8 = (u_int8_t *) (dst16 + 1);
	*dst16 = (u_int) boot_addr & 0xffff;
	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
}
2239
2240
/*
 * this function starts the AP (application processor) identified
 * by the logical CPU number 'logical_cpu', which is translated to its
 * physical APIC ID with CPU_TO_ID().  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It ain't pretty,
 * but it seems to work.
 *
 * 'boot_addr' supplies the STARTUP IPI vector: bits 12-19 of the address.
 *
 * Returns 1 as soon as mp_ncpus rises above the value it had on entry,
 * 0 if that has not happened by the time the APIC timer armed at the end
 * of the sequence runs out.
 */
static int
start_ap(int logical_cpu, u_int boot_addr)
{
	int     physical_cpu;
	int     vector;
	int     cpus;
	u_long  icr_lo, icr_hi;

	POSTCODE(START_AP_POST);

	/* get the PHYSICAL APIC ID# */
	physical_cpu = CPU_TO_ID(logical_cpu);

	/* calculate the vector */
	vector = (boot_addr >> 12) & 0xff;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_ncpus;

	/*
	 * first we do an INIT/RESET IPI this INIT IPI might be run, resetting
	 * and running the target CPU. OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
	 * ignored.
	 */

	/* setup the address for the target AP */
	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
	icr_hi |= (physical_cpu << 24);
	lapic.icr_hi = icr_hi;

	/* setup common fields for subsequent IPIs */
	icr_lo = lapic.icr_lo & APIC_ICRLO_RESV_MASK;
	icr_lo |= APIC_DESTMODE_PHY;

	/* do an INIT IPI: assert RESET */
	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DELMODE_INIT;

	/* wait for pending status end */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/* do an INIT IPI: deassert RESET */
	lapic.icr_lo = icr_lo | APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DELMODE_INIT;

	/* wait for pending status end */
	u_sleep(10000);		/* wait ~10mS */
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched, (P5 bug) this 1st STARTUP would then terminate
	 * immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run. and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
	 * will run.
	 */

	/* do a STARTUP IPI */
	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DELMODE_STARTUP | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */

	lapic.icr_lo = icr_lo | APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_DEASSERT | APIC_DELMODE_STARTUP | vector;
	while (lapic.icr_lo & APIC_DELSTAT_MASK)
		 /* spin */ ;
	u_sleep(200);		/* wait ~200uS */

	/* wait for it to start */
	/* poll mp_ncpus until the APIC timer reads zero */
	set_apic_timer(5000000);/* == 5 seconds */
	while (read_apic_timer())
		if (mp_ncpus > cpus)
			return 1;	/* return SUCCESS */

	return 0;		/* return FAILURE */
}
2337
2338#if defined(APIC_IO)
2339
#ifdef COUNT_XINVLTLB_HITS
/*
 * Optional TLB shootdown statistics, exported below the debug.xhits
 * sysctl node.
 *
 * xhits_gbl, xhits_pg and xhits_rng hold one counter per CPU (MAXCPU
 * entries each) and are exported as opaque blobs.
 * NOTE(review): they are not updated anywhere in this part of the file;
 * presumably the IPI handlers increment them -- confirm.
 */
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

/*
 * Counts of shootdowns requested through smp_invltlb(), smp_invlpg() and
 * smp_invlpg_range(); ipi_range_size accumulates the number of pages
 * covered by range requests.  The counters are unsigned, so they are
 * exported with SYSCTL_UINT rather than SYSCTL_INT.
 */
u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
    &ipi_range_size, 0, "");

/* Same as above, for the smp_masked_*() variants. */
u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif
2375
2376/*
2377 * Flush the TLB on all other CPU's
2378 */
2379static void
2380smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
2381{
2382	u_int ncpu;
2383	register_t eflags;
2384
2385	ncpu = mp_ncpus - 1;	/* does not shootdown self */
2386	if (ncpu < 1)
2387		return;		/* no other cpus */
2388	eflags = read_eflags();
2389	if ((eflags & PSL_I) == 0)
2390		panic("absolutely cannot call smp_ipi_shootdown with interrupts already disabled");
2391	mtx_lock_spin(&smp_tlb_mtx);
2392	smp_tlb_addr1 = addr1;
2393	smp_tlb_addr2 = addr2;
2394	atomic_store_rel_int(&smp_tlb_wait, 0);
2395	ipi_all_but_self(vector);
2396	while (smp_tlb_wait < ncpu)
2397		ia32_pause();
2398	mtx_unlock_spin(&smp_tlb_mtx);
2399}
2400
2401/*
2402 * This is about as magic as it gets.  fortune(1) has got similar code
2403 * for reversing bits in a word.  Who thinks up this stuff??
2404 *
2405 * Yes, it does appear to be consistently faster than:
2406 * while (i = ffs(m)) {
2407 *	m >>= i;
2408 *	bits++;
2409 * }
2410 * and
2411 * while (lsb = (m & -m)) {	// This is magic too
2412 * 	m &= ~lsb;		// or: m ^= lsb
2413 *	bits++;
2414 * }
2415 * Both of these latter forms do some very strange things on gcc-3.1 with
2416 * -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
2417 * There is probably an SSE or MMX popcnt instruction.
2418 *
2419 * I wonder if this should be in libkern?
2420 *
2421 * XXX Stop the presses!  Another one:
2422 * static __inline u_int32_t
2423 * popcnt1(u_int32_t v)
2424 * {
2425 *	v -= ((v >> 1) & 0x55555555);
2426 *	v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
2427 *	v = (v + (v >> 4)) & 0x0F0F0F0F;
2428 *	return (v * 0x01010101) >> 24;
2429 * }
2430 * The downside is that it has a multiply.  With a pentium3 with
2431 * -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
2432 * an imull, and in that case it is faster.  In most other cases
2433 * it appears slightly slower.
2434 */
static __inline u_int32_t
popcnt(u_int32_t m)
{

	/*
	 * Count the set bits of 'm' by summing neighbouring bit fields in
	 * parallel; the field width doubles at every step (1, 2, 4, 8 and
	 * finally 16 bits), so the last sum is the total population count.
	 */
	m = ((m >> 1) & 0x55555555) + (m & 0x55555555);
	m = ((m >> 2) & 0x33333333) + (m & 0x33333333);
	m = ((m >> 4) & 0x0f0f0f0f) + (m & 0x0f0f0f0f);
	m = ((m >> 8) & 0x00ff00ff) + (m & 0x00ff00ff);
	m = (m >> 16) + (m & 0x0000ffff);
	return (m);
}
2446
2447static void
2448smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
2449{
2450	int ncpu, othercpus;
2451	register_t eflags;
2452
2453	othercpus = mp_ncpus - 1;
2454	if (mask == (u_int)-1) {
2455		ncpu = othercpus;
2456		if (ncpu < 1)
2457			return;
2458	} else {
2459		mask &= ~PCPU_GET(cpumask);
2460		if (mask == 0)
2461			return;
2462		ncpu = popcnt(mask);
2463		if (ncpu > othercpus) {
2464			/* XXX this should be a panic offence */
2465			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
2466			    ncpu, othercpus);
2467			ncpu = othercpus;
2468		}
2469		/* XXX should be a panic, implied by mask == 0 above */
2470		if (ncpu < 1)
2471			return;
2472	}
2473	eflags = read_eflags();
2474	if ((eflags & PSL_I) == 0)
2475		panic("absolutely cannot call smp_targeted_ipi_shootdown with interrupts already disabled");
2476	mtx_lock_spin(&smp_tlb_mtx);
2477	smp_tlb_addr1 = addr1;
2478	smp_tlb_addr2 = addr2;
2479	atomic_store_rel_int(&smp_tlb_wait, 0);
2480	if (mask == (u_int)-1)
2481		ipi_all_but_self(vector);
2482	else
2483		ipi_selected(mask, vector);
2484	while (smp_tlb_wait < ncpu)
2485		ia32_pause();
2486	mtx_unlock_spin(&smp_tlb_mtx);
2487}
2488#endif
2489
/*
 * Send an IPI_INVLTLB shootdown to all other CPUs.  Does nothing unless
 * the kernel is built with APIC_IO and smp_started is set.
 */
void
smp_invltlb(void)
{
#if defined(APIC_IO)
	if (!smp_started)
		return;
	smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
#ifdef COUNT_XINVLTLB_HITS
	ipi_global++;
#endif
#endif  /* APIC_IO */
}
2502
2503void
2504smp_invlpg(vm_offset_t addr)
2505{
2506#if defined(APIC_IO)
2507	if (smp_started) {
2508		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
2509#ifdef COUNT_XINVLTLB_HITS
2510		ipi_page++;
2511#endif
2512	}
2513#endif  /* APIC_IO */
2514}
2515
2516void
2517smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
2518{
2519#if defined(APIC_IO)
2520	if (smp_started) {
2521		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
2522#ifdef COUNT_XINVLTLB_HITS
2523		ipi_range++;
2524		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
2525#endif
2526	}
2527#endif  /* APIC_IO */
2528}
2529
2530void
2531smp_masked_invltlb(u_int mask)
2532{
2533#if defined(APIC_IO)
2534	if (smp_started) {
2535		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
2536#ifdef COUNT_XINVLTLB_HITS
2537		ipi_masked_global++;
2538#endif
2539	}
2540#endif  /* APIC_IO */
2541}
2542
2543void
2544smp_masked_invlpg(u_int mask, vm_offset_t addr)
2545{
2546#if defined(APIC_IO)
2547	if (smp_started) {
2548		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
2549#ifdef COUNT_XINVLTLB_HITS
2550		ipi_masked_page++;
2551#endif
2552	}
2553#endif  /* APIC_IO */
2554}
2555
2556void
2557smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
2558{
2559#if defined(APIC_IO)
2560	if (smp_started) {
2561		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
2562#ifdef COUNT_XINVLTLB_HITS
2563		ipi_masked_range++;
2564		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;
2565#endif
2566	}
2567#endif  /* APIC_IO */
2568}
2569
2570
2571/*
2572 * This is called once the rest of the system is up and running and we're
2573 * ready to let the AP's out of the pen.
2574 */
2575void
2576ap_init(void)
2577{
2578	u_int	apic_id;
2579
2580	/* spin until all the AP's are ready */
2581	while (!aps_ready)
2582		ia32_pause();
2583
2584	/* BSP may have changed PTD while we were waiting */
2585	invltlb();
2586
2587#if defined(I586_CPU) && !defined(NO_F00F_HACK)
2588	lidt(&r_idt);
2589#endif
2590
2591	/* set up CPU registers and state */
2592	cpu_setregs();
2593
2594	/* set up FPU state on the AP */
2595	npxinit(__INITIAL_NPXCW__);
2596
2597	/* set up SSE registers */
2598	enable_sse();
2599
2600	/* A quick check from sanity claus */
2601	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
2602	if (PCPU_GET(cpuid) != apic_id) {
2603		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
2604		printf("SMP: apic_id = %d\n", apic_id);
2605		printf("PTD[MPPTDI] = %#jx\n", (uintmax_t)PTD[MPPTDI]);
2606		panic("cpuid mismatch! boom!!");
2607	}
2608
2609	/* Init local apic for irq's */
2610	apic_initialize();
2611
2612	/* Set memory range attributes for this CPU to match the BSP */
2613	mem_range_AP_init();
2614
2615	mtx_lock_spin(&ap_boot_mtx);
2616
2617	smp_cpus++;
2618
2619	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
2620	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
2621
2622	/* Build our map of 'other' CPUs. */
2623	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
2624
2625	if (bootverbose)
2626		apic_dump("ap_init()");
2627
2628	if (smp_cpus == mp_ncpus) {
2629		/* enable IPI's, tlb shootdown, freezes etc */
2630		atomic_store_rel_int(&smp_started, 1);
2631		smp_active = 1;	 /* historic */
2632	}
2633
2634	mtx_unlock_spin(&ap_boot_mtx);
2635
2636	/* wait until all the AP's are up */
2637	while (smp_started == 0)
2638		ia32_pause();
2639
2640	/* ok, now grab sched_lock and enter the scheduler */
2641	mtx_lock_spin(&sched_lock);
2642
2643	binuptime(PCPU_PTR(switchtime));
2644	PCPU_SET(switchticks, ticks);
2645
2646	cpu_throw(NULL, choosethread());	/* doesn't return */
2647
2648	panic("scheduler returned us to %s", __func__);
2649}
2650
2651/*
2652 * For statclock, we send an IPI to all CPU's to have them call this
2653 * function.
2654 *
2655 * WARNING! unpend() will call statclock() directly and skip this
2656 * routine.
2657 */
2658void
2659forwarded_statclock(struct clockframe frame)
2660{
2661
2662	if (profprocs != 0)
2663		profclock(&frame);
2664	if (pscnt == psdiv)
2665		statclock(&frame);
2666}
2667
2668void
2669forward_statclock(void)
2670{
2671	int map;
2672
2673	CTR0(KTR_SMP, "forward_statclock");
2674
2675	if (!smp_started || cold || panicstr)
2676		return;
2677
2678	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
2679	if (map != 0)
2680		ipi_selected(map, IPI_STATCLOCK);
2681}
2682
/*
 * For each hardclock(), we send an IPI to all other CPU's to have them
 * execute this function, which simply hands the interrupted clock frame
 * to hardclock_process().
 *
 * It would be nice to reduce contention on sched_lock if we could simply
 * peek at the CPU to determine the user/kernel state and call
 * hardclock_process() on the CPU receiving the clock interrupt and then
 * just use a simple IPI to handle any ast's if needed.
 *
 * WARNING! unpend() will call hardclock_process() directly and skip this
 * routine.
 */
void
forwarded_hardclock(struct clockframe frame)
{

	hardclock_process(&frame);
}
2699
2700void
2701forward_hardclock(void)
2702{
2703	u_int map;
2704
2705	CTR0(KTR_SMP, "forward_hardclock");
2706
2707	if (!smp_started || cold || panicstr)
2708		return;
2709
2710	map = PCPU_GET(other_cpus) & ~(stopped_cpus|hlt_cpus_mask);
2711	if (map != 0)
2712		ipi_selected(map, IPI_HARDCLOCK);
2713}
2714
2715#ifdef APIC_INTR_REORDER
2716/*
2717 *	Maintain mapping from softintr vector to isr bit in local apic.
2718 */
2719void
2720set_lapic_isrloc(int intr, int vector)
2721{
2722	if (intr < 0 || intr > 32)
2723		panic("set_apic_isrloc: bad intr argument: %d",intr);
2724	if (vector < ICU_OFFSET || vector > 255)
2725		panic("set_apic_isrloc: bad vector argument: %d",vector);
2726	apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2);
2727	apic_isrbit_location[intr].bit = (1<<(vector & 31));
2728}
2729#endif
2730
/*
 * send an IPI to a set of cpus.
 *
 * 'cpus' is the bitmask of target CPUs and 'ipi' the IPI to deliver.
 * The request is traced under KTR_SMP and handed to selected_apic_ipi()
 * with APIC_DELMODE_FIXED.
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	selected_apic_ipi(cpus, ipi, APIC_DELMODE_FIXED);
}
2741
/*
 * send the IPI 'ipi' to all CPUs, including myself
 *
 * The request is traced under KTR_SMP and issued through apic_ipi() with
 * the APIC_DEST_ALLISELF destination and APIC_DELMODE_FIXED.
 */
void
ipi_all(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	apic_ipi(APIC_DEST_ALLISELF, ipi, APIC_DELMODE_FIXED);
}
2752
/*
 * send the IPI 'ipi' to all CPUs EXCEPT myself
 *
 * The request is traced under KTR_SMP and issued through apic_ipi() with
 * the APIC_DEST_ALLESELF destination and APIC_DELMODE_FIXED.
 */
void
ipi_all_but_self(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	apic_ipi(APIC_DEST_ALLESELF, ipi, APIC_DELMODE_FIXED);
}
2763
/*
 * send the IPI 'ipi' to myself
 *
 * The request is traced under KTR_SMP and issued through apic_ipi() with
 * the APIC_DEST_SELF destination and APIC_DELMODE_FIXED.
 */
void
ipi_self(u_int ipi)
{

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	apic_ipi(APIC_DEST_SELF, ipi, APIC_DELMODE_FIXED);
}
2774
2775static void
2776release_aps(void *dummy __unused)
2777{
2778
2779	if (mp_ncpus == 1)
2780		return;
2781	mtx_lock_spin(&sched_lock);
2782	atomic_store_rel_int(&aps_ready, 1);
2783	while (smp_started == 0)
2784		ia32_pause();
2785	mtx_unlock_spin(&sched_lock);
2786}
2787
2788SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
2789
2790static int
2791sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS)
2792{
2793	u_int mask;
2794	int error;
2795
2796	mask = hlt_cpus_mask;
2797	error = sysctl_handle_int(oidp, &mask, 0, req);
2798	if (error || !req->newptr)
2799		return (error);
2800
2801	if (logical_cpus_mask != 0 &&
2802	    (mask & logical_cpus_mask) == logical_cpus_mask)
2803		hlt_logical_cpus = 1;
2804	else
2805		hlt_logical_cpus = 0;
2806
2807	if ((mask & all_cpus) == all_cpus)
2808		mask &= ~(1<<0);
2809	hlt_cpus_mask = mask;
2810	return (error);
2811}
2812SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW,
2813    0, 0, sysctl_hlt_cpus, "IU", "");
2814
2815static int
2816sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS)
2817{
2818	int disable, error;
2819
2820	disable = hlt_logical_cpus;
2821	error = sysctl_handle_int(oidp, &disable, 0, req);
2822	if (error || !req->newptr)
2823		return (error);
2824
2825	if (disable)
2826		hlt_cpus_mask |= logical_cpus_mask;
2827	else
2828		hlt_cpus_mask &= ~logical_cpus_mask;
2829
2830	if ((hlt_cpus_mask & all_cpus) == all_cpus)
2831		hlt_cpus_mask &= ~(1<<0);
2832
2833	hlt_logical_cpus = disable;
2834	return (error);
2835}
2836
2837static void
2838cpu_hlt_setup(void *dummy __unused)
2839{
2840
2841	if (logical_cpus_mask != 0) {
2842		TUNABLE_INT_FETCH("machdep.hlt_logical_cpus",
2843		    &hlt_logical_cpus);
2844		sysctl_ctx_init(&logical_cpu_clist);
2845		SYSCTL_ADD_PROC(&logical_cpu_clist,
2846		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
2847		    "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0,
2848		    sysctl_hlt_logical_cpus, "IU", "");
2849		SYSCTL_ADD_UINT(&logical_cpu_clist,
2850		    SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO,
2851		    "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD,
2852		    &logical_cpus_mask, 0, "");
2853
2854		if (hlt_logical_cpus)
2855			hlt_cpus_mask |= logical_cpus_mask;
2856	}
2857}
2858SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL);
2859
2860int
2861mp_grab_cpu_hlt(void)
2862{
2863	u_int mask = PCPU_GET(cpumask);
2864	int retval;
2865
2866	retval = mask & hlt_cpus_mask;
2867	while (mask & hlt_cpus_mask)
2868		__asm __volatile("sti; hlt" : : : "memory");
2869	return (retval);
2870}
2871