mp_x86.c revision 74283
1/*
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/i386/i386/mp_machdep.c 74283 2001-03-15 05:10:06Z peter $
26 */
27
28#include "opt_cpu.h"
29
30#ifdef SMP
31#include <machine/smptests.h>
32#else
33#error "the SMP kernel option is required to build this file"
34#endif
35
36#include <sys/param.h>
37#include <sys/bus.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/proc.h>
41#include <sys/sysctl.h>
42#include <sys/malloc.h>
43#include <sys/memrange.h>
44#include <sys/mutex.h>
45#ifdef BETTER_CLOCK
46#include <sys/dkstat.h>
47#endif
48#include <sys/cons.h>	/* cngetc() */
49
50#include <vm/vm.h>
51#include <vm/vm_param.h>
52#include <vm/pmap.h>
53#include <vm/vm_kern.h>
54#include <vm/vm_extern.h>
55#ifdef BETTER_CLOCK
56#include <sys/lock.h>
57#include <vm/vm_map.h>
58#include <sys/user.h>
59#ifdef GPROF
60#include <sys/gmon.h>
61#endif
62#endif
63
64#include <machine/smp.h>
65#include <machine/apic.h>
66#include <machine/atomic.h>
67#include <machine/cpufunc.h>
68#include <machine/mpapic.h>
69#include <machine/psl.h>
70#include <machine/segments.h>
71#include <machine/smptests.h>	/** TEST_DEFAULT_CONFIG, TEST_TEST1 */
72#include <machine/tss.h>
73#include <machine/specialreg.h>
74#include <machine/globaldata.h>
75
76#if defined(APIC_IO)
77#include <machine/md_var.h>		/* setidt() */
78#include <i386/isa/icu.h>		/* IPIs */
79#include <i386/isa/intr_machdep.h>	/* IPIs */
80#endif	/* APIC_IO */
81
82#if defined(TEST_DEFAULT_CONFIG)
83#define MPFPS_MPFB1	TEST_DEFAULT_CONFIG
84#else
85#define MPFPS_MPFB1	mpfps->mpfb1
86#endif  /* TEST_DEFAULT_CONFIG */
87
88#define WARMBOOT_TARGET		0
89#define WARMBOOT_OFF		(KERNBASE + 0x0467)
90#define WARMBOOT_SEG		(KERNBASE + 0x0469)
91
92#ifdef PC98
93#define BIOS_BASE		(0xe8000)
94#define BIOS_SIZE		(0x18000)
95#else
96#define BIOS_BASE		(0xf0000)
97#define BIOS_SIZE		(0x10000)
98#endif
99#define BIOS_COUNT		(BIOS_SIZE/4)
100
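/*
 * CMOS register 0x0f is the BIOS shutdown status byte.  Writing 0x0a there
 * ("warm start") tells the BIOS to skip POST and jump through the warm-boot
 * vector stored at 0040:0067 (WARMBOOT_OFF/WARMBOOT_SEG above).
 * start_all_aps() points that vector at the AP trampoline so an AP that
 * comes out of reset via the BIOS still lands in our boot code.
 */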
101#define CMOS_REG		(0x70)
102#define CMOS_DATA		(0x71)
103#define BIOS_RESET		(0x0f)
104#define BIOS_WARM		(0x0a)
105
106#define PROCENTRY_FLAG_EN	0x01
107#define PROCENTRY_FLAG_BP	0x02
108#define IOAPICENTRY_FLAG_EN	0x01
109
110
111/* MP Floating Pointer Structure */
112typedef struct MPFPS {
113	char    signature[4];
114	void   *pap;
115	u_char  length;
116	u_char  spec_rev;
117	u_char  checksum;
118	u_char  mpfb1;
119	u_char  mpfb2;
120	u_char  mpfb3;
121	u_char  mpfb4;
122	u_char  mpfb5;
123}      *mpfps_t;
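/*
 * mpfb1 (MP feature byte 1) is zero when an MP configuration table is
 * present (pap points to it) and otherwise holds one of the default
 * configuration type numbers from chapter 5 of the MP spec.  Bit 7 of
 * mpfb2 (IMCRP) indicates that the IMCR is present and the system starts
 * in PIC mode; see bsp_apic_configure() and the picmode flag below.
 */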
124
125/* MP Configuration Table Header */
126typedef struct MPCTH {
127	char    signature[4];
128	u_short base_table_length;
129	u_char  spec_rev;
130	u_char  checksum;
131	u_char  oem_id[8];
132	u_char  product_id[12];
133	void   *oem_table_pointer;
134	u_short oem_table_size;
135	u_short entry_count;
136	void   *apic_address;
137	u_short extended_table_length;
138	u_char  extended_table_checksum;
139	u_char  reserved;
140}      *mpcth_t;
141
142
143typedef struct PROCENTRY {
144	u_char  type;
145	u_char  apic_id;
146	u_char  apic_version;
147	u_char  cpu_flags;
148	u_long  cpu_signature;
149	u_long  feature_flags;
150	u_long  reserved1;
151	u_long  reserved2;
152}      *proc_entry_ptr;
153
154typedef struct BUSENTRY {
155	u_char  type;
156	u_char  bus_id;
157	char    bus_type[6];
158}      *bus_entry_ptr;
159
160typedef struct IOAPICENTRY {
161	u_char  type;
162	u_char  apic_id;
163	u_char  apic_version;
164	u_char  apic_flags;
165	void   *apic_address;
166}      *io_apic_entry_ptr;
167
168typedef struct INTENTRY {
169	u_char  type;
170	u_char  int_type;
171	u_short int_flags;
172	u_char  src_bus_id;
173	u_char  src_bus_irq;
174	u_char  dst_apic_id;
175	u_char  dst_apic_int;
176}      *int_entry_ptr;
177
178/* descriptions of MP basetable entries */
179typedef struct BASETABLE_ENTRY {
180	u_char  type;
181	u_char  length;
182	char    name[16];
183}       basetable_entry;
184
185/*
186 * this code MUST be enabled here and in mpboot.s.
187 * it follows the very early stages of AP boot by placing values in CMOS RAM.
188 * it NORMALLY will never be needed, hence the primitive method of enabling it.
189 *
190#define CHECK_POINTS
191 */
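/*
 * When enabled, the checkpoint macros store a progress code in CMOS RAM
 * bytes 0x34-0x39 at each stage of AP startup; if an AP wedges, the BSP
 * dumps those bytes via CHECK_PRINT() (see start_all_aps()) to show how
 * far the AP got.
 */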
192
193#if defined(CHECK_POINTS) && !defined(PC98)
194#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
195#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
196
197#define CHECK_INIT(D)				\
198	CHECK_WRITE(0x34, (D));			\
199	CHECK_WRITE(0x35, (D));			\
200	CHECK_WRITE(0x36, (D));			\
201	CHECK_WRITE(0x37, (D));			\
202	CHECK_WRITE(0x38, (D));			\
203	CHECK_WRITE(0x39, (D));
204
205#define CHECK_PRINT(S)				\
206	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
207	   (S),					\
208	   CHECK_READ(0x34),			\
209	   CHECK_READ(0x35),			\
210	   CHECK_READ(0x36),			\
211	   CHECK_READ(0x37),			\
212	   CHECK_READ(0x38),			\
213	   CHECK_READ(0x39));
214
215#else				/* CHECK_POINTS */
216
217#define CHECK_INIT(D)
218#define CHECK_PRINT(S)
219
220#endif				/* CHECK_POINTS */
221
222/*
223 * Values to send to the POST hardware.
224 */
225#define MP_BOOTADDRESS_POST	0x10
226#define MP_PROBE_POST		0x11
227#define MPTABLE_PASS1_POST	0x12
228
229#define MP_START_POST		0x13
230#define MP_ENABLE_POST		0x14
231#define MPTABLE_PASS2_POST	0x15
232
233#define START_ALL_APS_POST	0x16
234#define INSTALL_AP_TRAMP_POST	0x17
235#define START_AP_POST		0x18
236
237#define MP_ANNOUNCE_POST	0x19
238
239/* used to hold the APs until we are ready to release them */
240struct mtx			ap_boot_mtx;
241
242/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
243int	current_postcode;
244
245/** XXX FIXME: what system files declare these??? */
246extern struct region_descriptor r_gdt, r_idt;
247
248 int	bsp_apic_ready = 0;	/* flags usability of BSP APIC */
249int	mp_ncpus;		/* # of CPUs, including BSP */
250 int	mp_naps;		/* # of application processors (APs) */
251int	mp_nbusses;		/* # of busses */
252int	mp_napics;		/* # of IO APICs */
253int	boot_cpu_id;		/* designated BSP */
254vm_offset_t cpu_apic_address;
255vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
256extern	int nkpt;
257
258u_int32_t cpu_apic_versions[MAXCPU];
259u_int32_t *io_apic_versions;
260
261#ifdef APIC_INTR_REORDER
262struct {
263	volatile int *location;
264	int bit;
265} apic_isrbit_location[32];
266#endif
267
268struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];
269
270/*
271 * APIC ID logical/physical mapping structures.
272 * We oversize these to simplify boot-time config.
273 */
274int     cpu_num_to_apic_id[NAPICID];
275int     io_num_to_apic_id[NAPICID];
276int     apic_id_to_logical[NAPICID];
277
278
279/* Bitmap of all available CPUs */
280u_int	all_cpus;
281
282/* AP uses this during bootstrap.  Do not staticize.  */
283char *bootSTK;
284static int bootAP;
285
286/* Hotwire a 0->4MB V==P mapping */
287extern pt_entry_t *KPTphys;
288
289/* SMP page table page */
290extern pt_entry_t *SMPpt;
291
292struct pcb stoppcbs[MAXCPU];
293
294int smp_started;		/* has the system started? */
295int smp_active = 0;		/* are the APs allowed to run? */
296SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, "");
297
298/* XXX maybe should be hw.ncpu */
299 static int smp_cpus = 1;	/* how many CPUs are running */
300SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, "");
301
302int invltlb_ok = 0;	/* throttle smp_invltlb() till safe */
303SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");
304
305/* Enable forwarding of a signal to a process running on a different CPU */
306static int forward_signal_enabled = 1;
307SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
308	   &forward_signal_enabled, 0, "");
309
310/* Enable forwarding of roundrobin to all other cpus */
311static int forward_roundrobin_enabled = 1;
312SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
313	   &forward_roundrobin_enabled, 0, "");
314
315
316/*
317 * Local data and functions.
318 */
319
320/* Set to 1 once we're ready to let the APs out of the pen. */
321static volatile int aps_ready = 0;
322
323static int	mp_capable;
324static u_int	boot_address;
325static u_int	base_memory;
326
327static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
328static mpfps_t	mpfps;
329static int	search_for_sig(u_int32_t target, int count);
330static void	mp_enable(u_int boot_addr);
331
332static void	mptable_pass1(void);
333static int	mptable_pass2(void);
334static void	default_mp_table(int type);
335static void	fix_mp_table(void);
336static void	setup_apic_irq_mapping(void);
337static void	init_locks(void);
338static int	start_all_aps(u_int boot_addr);
339static void	install_ap_tramp(u_int boot_addr);
340static int	start_ap(int logicalCpu, u_int boot_addr);
341void		ap_init(void);
342static int	apic_int_is_bus_type(int intr, int bus_type);
343static void	release_aps(void *dummy);
344
345/*
346 * initialize all the SMP locks
347 */
348
349/* critical region around IO APIC, apic_imen */
350struct mtx		imen_mtx;
351
352/* lock region used by kernel profiling */
353struct mtx		mcount_mtx;
354
355#ifdef USE_COMLOCK
356/* locks com (tty) data/hardware accesses: a FASTINTR() */
357struct mtx		com_mtx;
358#endif /* USE_COMLOCK */
359
360/* lock around the MP rendezvous */
361static struct mtx	smp_rv_mtx;
362
363/* only 1 CPU can panic at a time :) */
364struct mtx		panic_mtx;
365
366static void
367init_locks(void)
368{
369	/*
370	 * XXX The mcount mutex probably needs to be statically initialized,
371	 * since it will be used even in the function calls that get us to this
372	 * point.
373	 */
374	mtx_init(&mcount_mtx, "mcount", MTX_DEF);
375
376	mtx_init(&smp_rv_mtx, "smp rendezvous", MTX_SPIN);
377	mtx_init(&panic_mtx, "panic", MTX_DEF);
378
379#ifdef USE_COMLOCK
380	mtx_init(&com_mtx, "com", MTX_SPIN);
381#endif /* USE_COMLOCK */
382
383	mtx_init(&ap_boot_mtx, "ap boot", MTX_SPIN);
384}
385
386/*
387 * Calculate usable address in base memory for AP trampoline code.
388 */
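/*
 * The trampoline must sit in the first megabyte because the AP starts in
 * real mode, and it must start on a 4K boundary because the STARTUP IPI
 * only carries the page number of the entry point (see start_ap()).
 */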
389u_int
390mp_bootaddress(u_int basemem)
391{
392	POSTCODE(MP_BOOTADDRESS_POST);
393
394	base_memory = basemem * 1024;	/* convert to bytes */
395
396	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
397	if ((base_memory - boot_address) < bootMP_size)
398		boot_address -= 4096;	/* not enough, lower by 4k */
399
400	return boot_address;
401}
402
403
404/*
405 * Look for an Intel MP spec table (i.e., SMP-capable hardware).
406 */
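/*
 * The MP spec allows the floating pointer structure (signature "_MP_") to
 * live in the first KB of the EBDA, in the last KB of base memory when no
 * EBDA exists, or in the BIOS ROM between 0xf0000 and 0xfffff; those are
 * the three regions scanned below.
 */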
407int
408mp_probe(void)
409{
410	int     x;
411	u_long  segment;
412	u_int32_t target;
413
414	POSTCODE(MP_PROBE_POST);
415
416	/* see if EBDA exists */
417	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
418		/* search first 1K of EBDA */
419		target = (u_int32_t) (segment << 4);
420		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
421			goto found;
422	} else {
423		/* last 1K of base memory, effective 'top of base' passed in */
424		target = (u_int32_t) (base_memory - 0x400);
425		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
426			goto found;
427	}
428
429	/* search the BIOS */
430	target = (u_int32_t) BIOS_BASE;
431	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
432		goto found;
433
434	/* nothing found */
435	mpfps = (mpfps_t)0;
436	mp_capable = 0;
437	return 0;
438
439found:
440	/* calculate needed resources */
441	mpfps = (mpfps_t)x;
442	mptable_pass1();
443
444	/* flag the fact that we are running multiple processors */
445	mp_capable = 1;
446	return 1;
447}
448
449
450/*
451 * Initialize the SMP hardware and the APIC and start up the APs.
452 */
453void
454mp_start(void)
455{
456	POSTCODE(MP_START_POST);
457
458	/* look for MP capable motherboard */
459	if (mp_capable)
460		mp_enable(boot_address);
461	else
462		panic("MP hardware not found!");
463}
464
465
466/*
467 * Print various information about the SMP system hardware and setup.
468 */
469void
470mp_announce(void)
471{
472	int     x;
473
474	POSTCODE(MP_ANNOUNCE_POST);
475
476	printf("FreeBSD/SMP: Multiprocessor motherboard\n");
477	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
478	printf(", version: 0x%08x", cpu_apic_versions[0]);
479	printf(", at 0x%08x\n", cpu_apic_address);
480	for (x = 1; x <= mp_naps; ++x) {
481		printf(" cpu%d (AP):  apic id: %2d", x, CPU_TO_ID(x));
482		printf(", version: 0x%08x", cpu_apic_versions[x]);
483		printf(", at 0x%08x\n", cpu_apic_address);
484	}
485
486#if defined(APIC_IO)
487	for (x = 0; x < mp_napics; ++x) {
488		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
489		printf(", version: 0x%08x", io_apic_versions[x]);
490		printf(", at 0x%08x\n", io_apic_address[x]);
491	}
492#else
493	printf(" Warning: APIC I/O disabled\n");
494#endif	/* APIC_IO */
495}
496
497/*
498 * AP CPUs call this to sync up protected mode.
499 */
500void
501init_secondary(void)
502{
503	int	gsel_tss;
504	int	x, myid = bootAP;
505
506	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
507	gdt_segs[GPROC0_SEL].ssd_base =
508		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
509	SMP_prvspace[myid].globaldata.gd_prvspace =
510		&SMP_prvspace[myid].globaldata;
511
512	for (x = 0; x < NGDT; x++) {
513		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
514	}
515
516	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
517	r_gdt.rd_base = (int) &gdt[myid * NGDT];
518	lgdt(&r_gdt);			/* does magic intra-segment return */
519
520	lidt(&r_idt);
521
522	lldt(_default_ldt);
523	PCPU_SET(currentldt, _default_ldt);
524
525	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
526	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
527	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
528	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
529	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
530	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
531	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
532	ltr(gsel_tss);
533
534	pmap_set_opt();
535}
536
537
538#if defined(APIC_IO)
539/*
540 * Final configuration of the BSP's local APIC:
541 *  - disable 'pic mode'.
542 *  - disable 'virtual wire mode'.
543 *  - enable NMI.
544 */
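/*
 * The IMCR lives behind I/O ports 0x22 (select, index 0x70) and 0x23
 * (data).  When the MP table flags PIC mode, the 8259 INTR and NMI lines
 * run straight to the BSP; setting bit 0 of the IMCR reroutes them through
 * the APICs (symmetric I/O mode).
 */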
545void
546bsp_apic_configure(void)
547{
548	u_char		byte;
549	u_int32_t	temp;
550
551	/* leave 'pic mode' if necessary */
552	if (picmode) {
553		outb(0x22, 0x70);	/* select IMCR */
554		byte = inb(0x23);	/* current contents */
555		byte |= 0x01;		/* mask external INTR */
556		outb(0x23, byte);	/* disconnect 8259s/NMI */
557	}
558
559	/* mask lint0 (the 8259 'virtual wire' connection) */
560	temp = lapic.lvt_lint0;
561	temp |= APIC_LVT_M;		/* set the mask */
562	lapic.lvt_lint0 = temp;
563
564	/* set up lint1 to handle NMI */
565	temp = lapic.lvt_lint1;
566	temp &= ~APIC_LVT_M;		/* clear the mask */
567	lapic.lvt_lint1 = temp;
568
569	if (bootverbose)
570		apic_dump("bsp_apic_configure()");
571}
572#endif  /* APIC_IO */
573
574
575/*******************************************************************
576 * local functions and data
577 */
578
579/*
580 * start the SMP system
581 */
582static void
583mp_enable(u_int boot_addr)
584{
585	int     x;
586#if defined(APIC_IO)
587	int     apic;
588	u_int   ux;
589#endif	/* APIC_IO */
590
591	POSTCODE(MP_ENABLE_POST);
592
593	/* turn on 4MB of V == P addressing so we can get to MP table */
594	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
595	invltlb();
596
597	/* examine the MP table for needed info, uses physical addresses */
598	x = mptable_pass2();
599
600	*(int *)PTD = 0;
601	invltlb();
602
603	/* can't process default configs till the CPU APIC is pmapped */
604	if (x)
605		default_mp_table(x);
606
607	/* post scan cleanup */
608	fix_mp_table();
609	setup_apic_irq_mapping();
610
611#if defined(APIC_IO)
612
613	/* fill the LOGICAL io_apic_versions table */
614	for (apic = 0; apic < mp_napics; ++apic) {
615		ux = io_apic_read(apic, IOAPIC_VER);
616		io_apic_versions[apic] = ux;
617		io_apic_set_id(apic, IO_TO_ID(apic));
618	}
619
620	/* program each IO APIC in the system */
621	for (apic = 0; apic < mp_napics; ++apic)
622		if (io_apic_setup(apic) < 0)
623			panic("IO APIC setup failure");
624
625	/* install a 'Spurious INTerrupt' vector */
626	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
627	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
628
629	/* install an inter-CPU IPI for TLB invalidation */
630	setidt(XINVLTLB_OFFSET, Xinvltlb,
631	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
632
633#ifdef BETTER_CLOCK
634	/* install an inter-CPU IPI for reading processor state */
635	setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate,
636	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
637#endif
638
639	/* install an inter-CPU IPI for all-CPU rendezvous */
640	setidt(XRENDEZVOUS_OFFSET, Xrendezvous,
641	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
642
643	/* install an inter-CPU IPI for forcing an additional software trap */
644	setidt(XCPUAST_OFFSET, Xcpuast,
645	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
646
647	/* install an inter-CPU IPI for CPU stop/restart */
648	setidt(XCPUSTOP_OFFSET, Xcpustop,
649	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
650
651#if defined(TEST_TEST1)
652	/* install a "fake hardware INTerrupt" vector */
653	setidt(XTEST1_OFFSET, Xtest1,
654	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
655#endif  /** TEST_TEST1 */
656
657#endif	/* APIC_IO */
658
659	/* initialize all SMP locks */
660	init_locks();
661
662	/* start each Application Processor */
663	start_all_aps(boot_addr);
664}
665
666
667/*
668 * look for the MP spec signature
669 */
670
671/* string defined by the Intel MP Spec as identifying the MP table */
672#define MP_SIG		0x5f504d5f	/* _MP_ */
673#define NEXT(X)		((X) += 4)
674static int
675search_for_sig(u_int32_t target, int count)
676{
677	int     x;
678	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
679
680	for (x = 0; x < count; NEXT(x))
681		if (addr[x] == MP_SIG)
682			/* make array index a byte index */
683			return (target + (x * sizeof(u_int32_t)));
684
685	return -1;
686}
687
688
689static basetable_entry basetable_entry_types[] =
690{
691	{0, 20, "Processor"},
692	{1, 8, "Bus"},
693	{2, 8, "I/O APIC"},
694	{3, 8, "I/O INT"},
695	{4, 8, "Local INT"}
696};
697
698typedef struct BUSDATA {
699	u_char  bus_id;
700	enum busTypes bus_type;
701}       bus_datum;
702
703typedef struct INTDATA {
704	u_char  int_type;
705	u_short int_flags;
706	u_char  src_bus_id;
707	u_char  src_bus_irq;
708	u_char  dst_apic_id;
709	u_char  dst_apic_int;
710	u_char	int_vector;
711}       io_int, local_int;
712
713typedef struct BUSTYPENAME {
714	u_char  type;
715	char    name[7];
716}       bus_type_name;
717
718static bus_type_name bus_type_table[] =
719{
720	{CBUS, "CBUS"},
721	{CBUSII, "CBUSII"},
722	{EISA, "EISA"},
723	{MCA, "MCA"},
724	{UNKNOWN_BUSTYPE, "---"},
725	{ISA, "ISA"},
726	{MCA, "MCA"},
727	{UNKNOWN_BUSTYPE, "---"},
728	{UNKNOWN_BUSTYPE, "---"},
729	{UNKNOWN_BUSTYPE, "---"},
730	{UNKNOWN_BUSTYPE, "---"},
731	{UNKNOWN_BUSTYPE, "---"},
732	{PCI, "PCI"},
733	{UNKNOWN_BUSTYPE, "---"},
734	{UNKNOWN_BUSTYPE, "---"},
735	{UNKNOWN_BUSTYPE, "---"},
736	{UNKNOWN_BUSTYPE, "---"},
737	{XPRESS, "XPRESS"},
738	{UNKNOWN_BUSTYPE, "---"}
739};
740/* from MP spec v1.4, table 5-1 */
741static int default_data[7][5] =
742{
743/*   nbus, id0, type0, id1, type1 */
744	{1, 0, ISA, 255, 255},
745	{1, 0, EISA, 255, 255},
746	{1, 0, EISA, 255, 255},
747	{1, 0, MCA, 255, 255},
748	{2, 0, ISA, 1, PCI},
749	{2, 0, EISA, 1, PCI},
750	{2, 0, MCA, 1, PCI}
751};
752
753
754/* the bus data */
755static bus_datum *bus_data;
756
757/* the IO INT data, one entry per possible APIC INTerrupt */
758static io_int  *io_apic_ints;
759
760static int nintrs;
761
762static int processor_entry	__P((proc_entry_ptr entry, int cpu));
763static int bus_entry		__P((bus_entry_ptr entry, int bus));
764static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
765static int int_entry		__P((int_entry_ptr entry, int intr));
766static int lookup_bus_type	__P((char *name));
767
768
769/*
770 * 1st pass on motherboard's Intel MP specification table.
771 *
772 * initializes:
773 *	mp_ncpus = 1
774 *
775 * determines:
776 *	cpu_apic_address (common to all CPUs)
777 *	io_apic_address[N]
778 *	mp_naps
779 *	mp_nbusses
780 *	mp_napics
781 *	nintrs
782 */
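/*
 * Pass 1 only counts things; the counts are used by mptable_pass2() to
 * size the io_apic_versions, ioapic, io_apic_ints and bus_data allocations
 * before the table is walked a second time to record the details.
 */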
783static void
784mptable_pass1(void)
785{
786	int	x;
787	mpcth_t	cth;
788	int	totalSize;
789	void*	position;
790	int	count;
791	int	type;
792
793	POSTCODE(MPTABLE_PASS1_POST);
794
795	/* clear various tables */
796	for (x = 0; x < NAPICID; ++x) {
797		io_apic_address[x] = ~0;	/* IO APIC address table */
798	}
799
800	/* init everything to empty */
801	mp_naps = 0;
802	mp_nbusses = 0;
803	mp_napics = 0;
804	nintrs = 0;
805
806	/* check for use of 'default' configuration */
807	if (MPFPS_MPFB1 != 0) {
808		/* use default addresses */
809		cpu_apic_address = DEFAULT_APIC_BASE;
810		io_apic_address[0] = DEFAULT_IO_APIC_BASE;
811
812		/* fill in with defaults */
813		mp_naps = 2;		/* includes BSP */
814		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
815#if defined(APIC_IO)
816		mp_napics = 1;
817		nintrs = 16;
818#endif	/* APIC_IO */
819	}
820	else {
821		if ((cth = mpfps->pap) == 0)
822			panic("MP Configuration Table Header MISSING!");
823
824		cpu_apic_address = (vm_offset_t) cth->apic_address;
825
826		/* walk the table, recording info of interest */
827		totalSize = cth->base_table_length - sizeof(struct MPCTH);
828		position = (u_char *) cth + sizeof(struct MPCTH);
829		count = cth->entry_count;
830
831		while (count--) {
832			switch (type = *(u_char *) position) {
833			case 0: /* processor_entry */
834				if (((proc_entry_ptr)position)->cpu_flags
835					& PROCENTRY_FLAG_EN)
836					++mp_naps;
837				break;
838			case 1: /* bus_entry */
839				++mp_nbusses;
840				break;
841			case 2: /* io_apic_entry */
842				if (((io_apic_entry_ptr)position)->apic_flags
843					& IOAPICENTRY_FLAG_EN)
844					io_apic_address[mp_napics++] =
845					    (vm_offset_t)((io_apic_entry_ptr)
846						position)->apic_address;
847				break;
848			case 3: /* int_entry */
849				++nintrs;
850				break;
851			case 4:	/* local_int_entry */
852				break;
853			default:
854				panic("mpfps Base Table HOSED!");
855				/* NOTREACHED */
856			}
857
858			totalSize -= basetable_entry_types[type].length;
859			position = (u_char *)position + basetable_entry_types[type].length;
860		}
861	}
862
863	/* qualify the numbers */
864	if (mp_naps > MAXCPU) {
865		printf("Warning: only using %d of %d available CPUs!\n",
866			MAXCPU, mp_naps);
867		mp_naps = MAXCPU;
868	}
869
870	/*
871	 * Count the BSP.
872	 * This is also used as a counter while starting the APs.
873	 */
874	mp_ncpus = 1;
875
876	--mp_naps;	/* subtract the BSP */
877}
878
879
880/*
881 * 2nd pass on motherboard's Intel MP specification table.
882 *
883 * sets:
884 *	boot_cpu_id
885 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
886 *	CPU_TO_ID(N), logical CPU to APIC ID table
887 *	IO_TO_ID(N), logical IO to APIC ID table
888 *	bus_data[N]
889 *	io_apic_ints[N]
890 */
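/*
 * Returns the default configuration type (1-7) if the floating pointer
 * structure says a default configuration is in use, otherwise 0 after the
 * full configuration table has been parsed.  mp_enable() feeds a non-zero
 * return into default_mp_table().
 */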
891static int
892mptable_pass2(void)
893{
894	int     x;
895	mpcth_t cth;
896	int     totalSize;
897	void*   position;
898	int     count;
899	int     type;
900	int     apic, bus, cpu, intr;
901	int	i, j;
902	int	pgeflag;
903
904	POSTCODE(MPTABLE_PASS2_POST);
905
906	pgeflag = 0;		/* XXX - Not used under SMP yet.  */
907
908	MALLOC(io_apic_versions, u_int32_t *, sizeof(u_int32_t) * mp_napics,
909	    M_DEVBUF, M_WAITOK);
910	MALLOC(ioapic, volatile ioapic_t **, sizeof(ioapic_t *) * mp_napics,
911	    M_DEVBUF, M_WAITOK);
912	MALLOC(io_apic_ints, io_int *, sizeof(io_int) * (nintrs + 1),
913	    M_DEVBUF, M_WAITOK);
914	MALLOC(bus_data, bus_datum *, sizeof(bus_datum) * mp_nbusses,
915	    M_DEVBUF, M_WAITOK);
916
917	bzero(ioapic, sizeof(ioapic_t *) * mp_napics);
918
919	for (i = 0; i < mp_napics; i++) {
920		for (j = 0; j < mp_napics; j++) {
921			/* same page frame as a previous IO apic? */
922			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) ==
923			    (io_apic_address[i] & PG_FRAME)) {
924				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
925					+ (NPTEPG-2-j) * PAGE_SIZE
926					+ (io_apic_address[i] & PAGE_MASK));
927				break;
928			}
929			/* use this slot if available */
930			if (((vm_offset_t)SMPpt[NPTEPG-2-j] & PG_FRAME) == 0) {
931				SMPpt[NPTEPG-2-j] = (pt_entry_t)(PG_V | PG_RW |
932				    pgeflag | (io_apic_address[i] & PG_FRAME));
933				ioapic[i] = (ioapic_t *)((u_int)SMP_prvspace
934					+ (NPTEPG-2-j) * PAGE_SIZE
935					+ (io_apic_address[i] & PAGE_MASK));
936				break;
937			}
938		}
939	}
940
941	/* clear various tables */
942	for (x = 0; x < NAPICID; ++x) {
943		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
944		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
945		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
946	}
947
948	/* clear bus data table */
949	for (x = 0; x < mp_nbusses; ++x)
950		bus_data[x].bus_id = 0xff;
951
952	/* clear IO APIC INT table */
953	for (x = 0; x < (nintrs + 1); ++x) {
954		io_apic_ints[x].int_type = 0xff;
955		io_apic_ints[x].int_vector = 0xff;
956	}
957
958	/* setup the cpu/apic mapping arrays */
959	boot_cpu_id = -1;
960
961	/* record whether PIC or virtual-wire mode */
962	picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
963
964	/* check for use of 'default' configuration */
965	if (MPFPS_MPFB1 != 0)
966		return MPFPS_MPFB1;	/* return default configuration type */
967
968	if ((cth = mpfps->pap) == 0)
969		panic("MP Configuration Table Header MISSING!");
970
971	/* walk the table, recording info of interest */
972	totalSize = cth->base_table_length - sizeof(struct MPCTH);
973	position = (u_char *) cth + sizeof(struct MPCTH);
974	count = cth->entry_count;
975	apic = bus = intr = 0;
976	cpu = 1;				/* pre-count the BSP */
977
978	while (count--) {
979		switch (type = *(u_char *) position) {
980		case 0:
981			if (processor_entry(position, cpu))
982				++cpu;
983			break;
984		case 1:
985			if (bus_entry(position, bus))
986				++bus;
987			break;
988		case 2:
989			if (io_apic_entry(position, apic))
990				++apic;
991			break;
992		case 3:
993			if (int_entry(position, intr))
994				++intr;
995			break;
996		case 4:
997			/* int_entry(position); */
998			break;
999		default:
1000			panic("mpfps Base Table HOSED!");
1001			/* NOTREACHED */
1002		}
1003
1004		totalSize -= basetable_entry_types[type].length;
1005		position = (u_char *) position + basetable_entry_types[type].length;
1006	}
1007
1008	if (boot_cpu_id == -1)
1009		panic("NO BSP found!");
1010
1011	/* report the fact that it's NOT a default configuration */
1012	return 0;
1013}
1014
1015
1016void
1017assign_apic_irq(int apic, int intpin, int irq)
1018{
1019	int x;
1020
1021	if (int_to_apicintpin[irq].ioapic != -1)
1022		panic("assign_apic_irq: inconsistent table");
1023
1024	int_to_apicintpin[irq].ioapic = apic;
1025	int_to_apicintpin[irq].int_pin = intpin;
1026	int_to_apicintpin[irq].apic_address = ioapic[apic];
1027	int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin;
1028
1029	for (x = 0; x < nintrs; x++) {
1030		if ((io_apic_ints[x].int_type == 0 ||
1031		     io_apic_ints[x].int_type == 3) &&
1032		    io_apic_ints[x].int_vector == 0xff &&
1033		    io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) &&
1034		    io_apic_ints[x].dst_apic_int == intpin)
1035			io_apic_ints[x].int_vector = irq;
1036	}
1037}
1038
1039void
1040revoke_apic_irq(int irq)
1041{
1042	int x;
1043	int oldapic;
1044	int oldintpin;
1045
1046	if (int_to_apicintpin[irq].ioapic == -1)
1047		panic("revoke_apic_irq: inconsistent table");
1048
1049	oldapic = int_to_apicintpin[irq].ioapic;
1050	oldintpin = int_to_apicintpin[irq].int_pin;
1051
1052	int_to_apicintpin[irq].ioapic = -1;
1053	int_to_apicintpin[irq].int_pin = 0;
1054	int_to_apicintpin[irq].apic_address = NULL;
1055	int_to_apicintpin[irq].redirindex = 0;
1056
1057	for (x = 0; x < nintrs; x++) {
1058		if ((io_apic_ints[x].int_type == 0 ||
1059		     io_apic_ints[x].int_type == 3) &&
1060		    io_apic_ints[x].int_vector == 0xff &&
1061		    io_apic_ints[x].dst_apic_id == IO_TO_ID(oldapic) &&
1062		    io_apic_ints[x].dst_apic_int == oldintpin)
1063			io_apic_ints[x].int_vector = 0xff;
1064	}
1065}
1066
1067
1068static void
1069allocate_apic_irq(int intr)
1070{
1071	int apic;
1072	int intpin;
1073	int irq;
1074
1075	if (io_apic_ints[intr].int_vector != 0xff)
1076		return;		/* Interrupt handler already assigned */
1077
1078	if (io_apic_ints[intr].int_type != 0 &&
1079	    (io_apic_ints[intr].int_type != 3 ||
1080	     (io_apic_ints[intr].dst_apic_id == IO_TO_ID(0) &&
1081	      io_apic_ints[intr].dst_apic_int == 0)))
1082		return;		/* Not INT or ExtInt on != (0, 0) */
1083
1084	irq = 0;
1085	while (irq < APIC_INTMAPSIZE &&
1086	       int_to_apicintpin[irq].ioapic != -1)
1087		irq++;
1088
1089	if (irq >= APIC_INTMAPSIZE)
1090		return;		/* No free interrupt handlers */
1091
1092	apic = ID_TO_IO(io_apic_ints[intr].dst_apic_id);
1093	intpin = io_apic_ints[intr].dst_apic_int;
1094
1095	assign_apic_irq(apic, intpin, irq);
1096	io_apic_setup_intpin(apic, intpin);
1097}
1098
1099
1100static void
1101swap_apic_id(int apic, int oldid, int newid)
1102{
1103	int x;
1104	int oapic;
1105
1106
1107	if (oldid == newid)
1108		return;			/* Nothing to do */
1109
1110	printf("Changing APIC ID for IO APIC #%d from %d to %d in MP table\n",
1111	       apic, oldid, newid);
1112
1113	/* Swap physical APIC IDs in interrupt entries */
1114	for (x = 0; x < nintrs; x++) {
1115		if (io_apic_ints[x].dst_apic_id == oldid)
1116			io_apic_ints[x].dst_apic_id = newid;
1117		else if (io_apic_ints[x].dst_apic_id == newid)
1118			io_apic_ints[x].dst_apic_id = oldid;
1119	}
1120
1121	/* Swap physical APIC IDs in IO_TO_ID mappings */
1122	for (oapic = 0; oapic < mp_napics; oapic++)
1123		if (IO_TO_ID(oapic) == newid)
1124			break;
1125
1126	if (oapic < mp_napics) {
1127		printf("Changing APIC ID for IO APIC #%d from "
1128		       "%d to %d in MP table\n",
1129		       oapic, newid, oldid);
1130		IO_TO_ID(oapic) = oldid;
1131	}
1132	IO_TO_ID(apic) = newid;
1133}
1134
1135
1136static void
1137fix_id_to_io_mapping(void)
1138{
1139	int x;
1140
1141	for (x = 0; x < NAPICID; x++)
1142		ID_TO_IO(x) = -1;
1143
1144	for (x = 0; x <= mp_naps; x++)
1145		if (CPU_TO_ID(x) < NAPICID)
1146			ID_TO_IO(CPU_TO_ID(x)) = x;
1147
1148	for (x = 0; x < mp_napics; x++)
1149		if (IO_TO_ID(x) < NAPICID)
1150			ID_TO_IO(IO_TO_ID(x)) = x;
1151}
1152
1153
1154static int
1155first_free_apic_id(void)
1156{
1157	int freeid, x;
1158
1159	for (freeid = 0; freeid < NAPICID; freeid++) {
1160		for (x = 0; x <= mp_naps; x++)
1161			if (CPU_TO_ID(x) == freeid)
1162				break;
1163		if (x <= mp_naps)
1164			continue;
1165		for (x = 0; x < mp_napics; x++)
1166			if (IO_TO_ID(x) == freeid)
1167				break;
1168		if (x < mp_napics)
1169			continue;
1170		return freeid;
1171	}
1172	return freeid;
1173}
1174
1175
1176static int
1177io_apic_id_acceptable(int apic, int id)
1178{
1179	int cpu;		/* Logical CPU number */
1180	int oapic;		/* Logical IO APIC number for other IO APIC */
1181
1182	if (id >= NAPICID)
1183		return 0;	/* Out of range */
1184
1185	for (cpu = 0; cpu <= mp_naps; cpu++)
1186		if (CPU_TO_ID(cpu) == id)
1187			return 0;	/* Conflict with CPU */
1188
1189	for (oapic = 0; oapic < mp_napics && oapic < apic; oapic++)
1190		if (IO_TO_ID(oapic) == id)
1191			return 0;	/* Conflict with other APIC */
1192
1193	return 1;		/* ID is acceptable for IO APIC */
1194}
1195
1196
1197/*
1198 * fix up problems found in the parsed Intel MP specification table
1199 */
1200static void
1201fix_mp_table(void)
1202{
1203	int	x;
1204	int	id;
1205	int	bus_0 = 0;	/* Stop GCC warning */
1206	int	bus_pci = 0;	/* Stop GCC warning */
1207	int	num_pci_bus;
1208	int	apic;		/* IO APIC unit number */
1209	int     freeid;		/* Free physical APIC ID */
1210	int	physid;		/* Current physical IO APIC ID */
1211
1212	/*
1213	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
1214	 * did it wrong.  The MP spec says that when more than 1 PCI bus
1215	 * exists the BIOS must begin with bus entries for the PCI bus and use
1216	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
1217	 * exists the BIOS can choose to ignore this ordering, and indeed many
1218	 * MP motherboards do ignore it.  This causes a problem when the PCI
1219	 * sub-system makes requests of the MP sub-system based on PCI bus
1220	 * numbers.	So here we look for the situation and renumber the
1221	 * busses and associated INTs in an effort to "make it right".
1222	 */
1223
1224	/* find bus 0, PCI bus, count the number of PCI busses */
1225	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
1226		if (bus_data[x].bus_id == 0) {
1227			bus_0 = x;
1228		}
1229		if (bus_data[x].bus_type == PCI) {
1230			++num_pci_bus;
1231			bus_pci = x;
1232		}
1233	}
1234	/*
1235	 * bus_0 == slot of bus with ID of 0
1236	 * bus_pci == slot of last PCI bus encountered
1237	 */
1238
1239	/* check the 1 PCI bus case for sanity */
1240	/* if it is number 0 all is well */
1241	if (num_pci_bus == 1 &&
1242	    bus_data[bus_pci].bus_id != 0) {
1243
1244		/* mis-numbered, swap with whichever bus uses slot 0 */
1245
1246		/* swap the bus entry types */
1247		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
1248		bus_data[bus_0].bus_type = PCI;
1249
1250		/* swap each relevant INTerrupt entry */
1251		id = bus_data[bus_pci].bus_id;
1252		for (x = 0; x < nintrs; ++x) {
1253			if (io_apic_ints[x].src_bus_id == id) {
1254				io_apic_ints[x].src_bus_id = 0;
1255			}
1256			else if (io_apic_ints[x].src_bus_id == 0) {
1257				io_apic_ints[x].src_bus_id = id;
1258			}
1259		}
1260	}
1261
1262	/* Assign IO APIC IDs.
1263	 *
1264	 * First try the existing ID. If a conflict is detected, try
1265	 * the ID in the MP table.  If a conflict is still detected, find
1266	 * a free id.
1267	 *
1268	 * We cannot use the ID_TO_IO table before all conflicts have been
1269	 * resolved and the table has been corrected.
1270	 */
1271	for (apic = 0; apic < mp_napics; ++apic) { /* For all IO APICs */
1272
1273		/* First try to use the value set by the BIOS */
1274		physid = io_apic_get_id(apic);
1275		if (io_apic_id_acceptable(apic, physid)) {
1276			if (IO_TO_ID(apic) != physid)
1277				swap_apic_id(apic, IO_TO_ID(apic), physid);
1278			continue;
1279		}
1280
1281		/* Then check if the value in the MP table is acceptable */
1282		if (io_apic_id_acceptable(apic, IO_TO_ID(apic)))
1283			continue;
1284
1285		/* Last resort, find a free APIC ID and use it */
1286		freeid = first_free_apic_id();
1287		if (freeid >= NAPICID)
1288			panic("No free physical APIC IDs found");
1289
1290		if (io_apic_id_acceptable(apic, freeid)) {
1291			swap_apic_id(apic, IO_TO_ID(apic), freeid);
1292			continue;
1293		}
1294		panic("Free physical APIC ID not usable");
1295	}
1296	fix_id_to_io_mapping();
1297
1298	/* detect and fix broken Compaq MP table */
1299	if (apic_int_type(0, 0) == -1) {
1300		printf("APIC_IO: MP table broken: 8259->APIC entry missing!\n");
1301		io_apic_ints[nintrs].int_type = 3;	/* ExtInt */
1302		io_apic_ints[nintrs].int_vector = 0xff;	/* Unassigned */
1303		/* XXX fixme, set src bus id etc, but it doesn't seem to hurt */
1304		io_apic_ints[nintrs].dst_apic_id = IO_TO_ID(0);
1305		io_apic_ints[nintrs].dst_apic_int = 0;	/* Pin 0 */
1306		nintrs++;
1307	}
1308}
1309
1310
1311/* Assign low level interrupt handlers */
1312static void
1313setup_apic_irq_mapping(void)
1314{
1315	int	x;
1316	int	int_vector;
1317
1318	/* Clear array */
1319	for (x = 0; x < APIC_INTMAPSIZE; x++) {
1320		int_to_apicintpin[x].ioapic = -1;
1321		int_to_apicintpin[x].int_pin = 0;
1322		int_to_apicintpin[x].apic_address = NULL;
1323		int_to_apicintpin[x].redirindex = 0;
1324	}
1325
1326	/* First assign ISA/EISA interrupts */
1327	for (x = 0; x < nintrs; x++) {
1328		int_vector = io_apic_ints[x].src_bus_irq;
1329		if (int_vector < APIC_INTMAPSIZE &&
1330		    io_apic_ints[x].int_vector == 0xff &&
1331		    int_to_apicintpin[int_vector].ioapic == -1 &&
1332		    (apic_int_is_bus_type(x, ISA) ||
1333		     apic_int_is_bus_type(x, EISA)) &&
1334		    io_apic_ints[x].int_type == 0) {
1335			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
1336					io_apic_ints[x].dst_apic_int,
1337					int_vector);
1338		}
1339	}
1340
1341	/* Assign ExtInt entry if no ISA/EISA interrupt 0 entry */
1342	for (x = 0; x < nintrs; x++) {
1343		if (io_apic_ints[x].dst_apic_int == 0 &&
1344		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
1345		    io_apic_ints[x].int_vector == 0xff &&
1346		    int_to_apicintpin[0].ioapic == -1 &&
1347		    io_apic_ints[x].int_type == 3) {
1348			assign_apic_irq(0, 0, 0);
1349			break;
1350		}
1351	}
1352	/* PCI interrupt assignment is deferred */
1353}
1354
1355
1356static int
1357processor_entry(proc_entry_ptr entry, int cpu)
1358{
1359	/* check for usability */
1360	if (!(entry->cpu_flags & PROCENTRY_FLAG_EN))
1361		return 0;
1362
1363	if (entry->apic_id >= NAPICID)
1364		panic("CPU APIC ID out of range (0..%d)", NAPICID - 1);
1365	/* check for BSP flag */
1366	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
1367		boot_cpu_id = entry->apic_id;
1368		CPU_TO_ID(0) = entry->apic_id;
1369		ID_TO_CPU(entry->apic_id) = 0;
1370		return 0;	/* it's already been counted */
1371	}
1372
1373	/* add another AP to list, if less than max number of CPUs */
1374	else if (cpu < MAXCPU) {
1375		CPU_TO_ID(cpu) = entry->apic_id;
1376		ID_TO_CPU(entry->apic_id) = cpu;
1377		return 1;
1378	}
1379
1380	return 0;
1381}
1382
1383
1384static int
1385bus_entry(bus_entry_ptr entry, int bus)
1386{
1387	int     x;
1388	char    c, name[8];
1389
1390	/* encode the name into an index */
1391	for (x = 0; x < 6; ++x) {
1392		if ((c = entry->bus_type[x]) == ' ')
1393			break;
1394		name[x] = c;
1395	}
1396	name[x] = '\0';
1397
1398	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
1399		panic("unknown bus type: '%s'", name);
1400
1401	bus_data[bus].bus_id = entry->bus_id;
1402	bus_data[bus].bus_type = x;
1403
1404	return 1;
1405}
1406
1407
1408static int
1409io_apic_entry(io_apic_entry_ptr entry, int apic)
1410{
1411	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
1412		return 0;
1413
1414	IO_TO_ID(apic) = entry->apic_id;
1415	if (entry->apic_id < NAPICID)
1416		ID_TO_IO(entry->apic_id) = apic;
1417
1418	return 1;
1419}
1420
1421
1422static int
1423lookup_bus_type(char *name)
1424{
1425	int     x;
1426
1427	for (x = 0; x < MAX_BUSTYPE; ++x)
1428		if (strcmp(bus_type_table[x].name, name) == 0)
1429			return bus_type_table[x].type;
1430
1431	return UNKNOWN_BUSTYPE;
1432}
1433
1434
1435static int
1436int_entry(int_entry_ptr entry, int intr)
1437{
1438	int apic;
1439
1440	io_apic_ints[intr].int_type = entry->int_type;
1441	io_apic_ints[intr].int_flags = entry->int_flags;
1442	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
1443	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
1444	if (entry->dst_apic_id == 255) {
1445		/* This signal goes to all IO APICs.  Select an IO APIC
1446		   with a sufficient number of interrupt pins */
1447		for (apic = 0; apic < mp_napics; apic++)
1448			if (((io_apic_read(apic, IOAPIC_VER) &
1449			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
1450			    entry->dst_apic_int)
1451				break;
1452		if (apic < mp_napics)
1453			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
1454		else
1455			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1456	} else
1457		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1458	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
1459
1460	return 1;
1461}
1462
1463
1464static int
1465apic_int_is_bus_type(int intr, int bus_type)
1466{
1467	int     bus;
1468
1469	for (bus = 0; bus < mp_nbusses; ++bus)
1470		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
1471		    && ((int) bus_data[bus].bus_type == bus_type))
1472			return 1;
1473
1474	return 0;
1475}
1476
1477
1478/*
1479 * Given a traditional ISA INT mask, return an APIC mask.
1480 */
1481u_int
1482isa_apic_mask(u_int isa_mask)
1483{
1484	int isa_irq;
1485	int apic_pin;
1486
1487#if defined(SKIP_IRQ15_REDIRECT)
1488	if (isa_mask == (1 << 15)) {
1489		printf("skipping ISA IRQ15 redirect\n");
1490		return isa_mask;
1491	}
1492#endif  /* SKIP_IRQ15_REDIRECT */
1493
1494	isa_irq = ffs(isa_mask);		/* find its bit position */
1495	if (isa_irq == 0)			/* doesn't exist */
1496		return 0;
1497	--isa_irq;				/* make it zero based */
1498
1499	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
1500	if (apic_pin == -1)
1501		return 0;
1502
1503	return (1 << apic_pin);			/* convert pin# to a mask */
1504}
1505
1506
1507/*
1508 * Determine which APIC pin an ISA/EISA INT is attached to.
1509 */
1510#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
1511#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
1512#define INTIRQ(I)	(io_apic_ints[(I)].int_vector)
1513#define INTAPIC(I)	(ID_TO_IO(io_apic_ints[(I)].dst_apic_id))
1514
1515#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)
1516int
1517isa_apic_irq(int isa_irq)
1518{
1519	int     intr;
1520
1521	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
1522		if (INTTYPE(intr) == 0) {		/* standard INT */
1523			if (SRCBUSIRQ(intr) == isa_irq) {
1524				if (apic_int_is_bus_type(intr, ISA) ||
1525			            apic_int_is_bus_type(intr, EISA)) {
1526					if (INTIRQ(intr) == 0xff)
1527						return -1; /* unassigned */
1528					return INTIRQ(intr);	/* found */
1529				}
1530			}
1531		}
1532	}
1533	return -1;					/* NOT found */
1534}
1535
1536
1537/*
1538 * Determine which APIC pin a PCI INT is attached to.
1539 */
1540#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
1541#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
1542#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
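/*
 * For PCI source buses the MP table encodes the interrupt line in the low
 * two bits of src_bus_irq (0-3 for INTA#-INTD#) and the PCI device number
 * in bits 2-6, which is what SRCBUSLINE() and SRCBUSDEVICE() pick apart.
 */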
1543int
1544pci_apic_irq(int pciBus, int pciDevice, int pciInt)
1545{
1546	int     intr;
1547
1548	--pciInt;					/* zero based */
1549
1550	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
1551		if ((INTTYPE(intr) == 0)		/* standard INT */
1552		    && (SRCBUSID(intr) == pciBus)
1553		    && (SRCBUSDEVICE(intr) == pciDevice)
1554		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
1555			if (apic_int_is_bus_type(intr, PCI)) {
1556				if (INTIRQ(intr) == 0xff)
1557					allocate_apic_irq(intr);
1558				if (INTIRQ(intr) == 0xff)
1559					return -1;	/* unassigned */
1560				return INTIRQ(intr);	/* exact match */
1561			}
1562
1563	return -1;					/* NOT found */
1564}
1565
1566int
1567next_apic_irq(int irq)
1568{
1569	int intr, ointr;
1570	int bus, bustype;
1571
1572	bus = 0;
1573	bustype = 0;
1574	for (intr = 0; intr < nintrs; intr++) {
1575		if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
1576			continue;
1577		bus = SRCBUSID(intr);
1578		bustype = apic_bus_type(bus);
1579		if (bustype != ISA &&
1580		    bustype != EISA &&
1581		    bustype != PCI)
1582			continue;
1583		break;
1584	}
1585	if (intr >= nintrs) {
1586		return -1;
1587	}
1588	for (ointr = intr + 1; ointr < nintrs; ointr++) {
1589		if (INTTYPE(ointr) != 0)
1590			continue;
1591		if (bus != SRCBUSID(ointr))
1592			continue;
1593		if (bustype == PCI) {
1594			if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
1595				continue;
1596			if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
1597				continue;
1598		}
1599		if (bustype == ISA || bustype == EISA) {
1600			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
1601				continue;
1602		}
1603		if (INTPIN(intr) == INTPIN(ointr))
1604			continue;
1605		break;
1606	}
1607	if (ointr >= nintrs) {
1608		return -1;
1609	}
1610	return INTIRQ(ointr);
1611}
1612#undef SRCBUSLINE
1613#undef SRCBUSDEVICE
1614#undef SRCBUSID
1615#undef SRCBUSIRQ
1616
1617#undef INTPIN
1618#undef INTIRQ
1619#undef INTAPIC
1620#undef INTTYPE
1621
1622
1623/*
1624 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
1625 *
1626 * XXX FIXME:
1627 *  Exactly what this means is unclear at this point.  It is a solution
1628 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
1629 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
1630 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
1631 *  option.
1632 */
1633int
1634undirect_isa_irq(int rirq)
1635{
1636#if defined(READY)
1637	if (bootverbose)
1638	    printf("Freeing redirected ISA irq %d.\n", rirq);
1639	/** FIXME: tickle the MB redirector chip */
1640	return -1;
1641#else
1642	if (bootverbose)
1643	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
1644	return 0;
1645#endif  /* READY */
1646}
1647
1648
1649/*
1650 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
1651 */
1652int
1653undirect_pci_irq(int rirq)
1654{
1655#if defined(READY)
1656	if (bootverbose)
1657		printf("Freeing redirected PCI irq %d.\n", rirq);
1658
1659	/** FIXME: tickle the MB redirector chip */
1660	return -1;
1661#else
1662	if (bootverbose)
1663		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
1664		       rirq);
1665	return 0;
1666#endif  /* READY */
1667}
1668
1669
1670/*
1671 * given a bus ID, return:
1672 *  the bus type if found
1673 *  -1 if NOT found
1674 */
1675int
1676apic_bus_type(int id)
1677{
1678	int     x;
1679
1680	for (x = 0; x < mp_nbusses; ++x)
1681		if (bus_data[x].bus_id == id)
1682			return bus_data[x].bus_type;
1683
1684	return -1;
1685}
1686
1687
1688/*
1689 * given a LOGICAL APIC# and pin#, return:
1690 *  the associated src bus ID if found
1691 *  -1 if NOT found
1692 */
1693int
1694apic_src_bus_id(int apic, int pin)
1695{
1696	int     x;
1697
1698	/* search each of the possible INTerrupt sources */
1699	for (x = 0; x < nintrs; ++x)
1700		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1701		    (pin == io_apic_ints[x].dst_apic_int))
1702			return (io_apic_ints[x].src_bus_id);
1703
1704	return -1;		/* NOT found */
1705}
1706
1707
1708/*
1709 * given a LOGICAL APIC# and pin#, return:
1710 *  the associated src bus IRQ if found
1711 *  -1 if NOT found
1712 */
1713int
1714apic_src_bus_irq(int apic, int pin)
1715{
1716	int     x;
1717
1718	for (x = 0; x < nintrs; x++)
1719		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1720		    (pin == io_apic_ints[x].dst_apic_int))
1721			return (io_apic_ints[x].src_bus_irq);
1722
1723	return -1;		/* NOT found */
1724}
1725
1726
1727/*
1728 * given a LOGICAL APIC# and pin#, return:
1729 *  the associated INTerrupt type if found
1730 *  -1 if NOT found
1731 */
1732int
1733apic_int_type(int apic, int pin)
1734{
1735	int     x;
1736
1737	/* search each of the possible INTerrupt sources */
1738	for (x = 0; x < nintrs; ++x)
1739		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1740		    (pin == io_apic_ints[x].dst_apic_int))
1741			return (io_apic_ints[x].int_type);
1742
1743	return -1;		/* NOT found */
1744}
1745
1746int
1747apic_irq(int apic, int pin)
1748{
1749	int x;
1750	int res;
1751
1752	for (x = 0; x < nintrs; ++x)
1753		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1754		    (pin == io_apic_ints[x].dst_apic_int)) {
1755			res = io_apic_ints[x].int_vector;
1756			if (res == 0xff)
1757				return -1;
1758			if (apic != int_to_apicintpin[res].ioapic)
1759				panic("apic_irq: inconsistent table");
1760			if (pin != int_to_apicintpin[res].int_pin)
1761				panic("apic_irq: inconsistent table (2)");
1762			return res;
1763		}
1764	return -1;
1765}
1766
1767
1768/*
1769 * given a LOGICAL APIC# and pin#, return:
1770 *  the associated trigger mode if found
1771 *  -1 if NOT found
1772 */
1773int
1774apic_trigger(int apic, int pin)
1775{
1776	int     x;
1777
1778	/* search each of the possible INTerrupt sources */
1779	for (x = 0; x < nintrs; ++x)
1780		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1781		    (pin == io_apic_ints[x].dst_apic_int))
1782			return ((io_apic_ints[x].int_flags >> 2) & 0x03);
1783
1784	return -1;		/* NOT found */
1785}
1786
1787
1788/*
1789 * given a LOGICAL APIC# and pin#, return:
1790 *  the associated 'active' level if found
1791 *  -1 if NOT found
1792 */
1793int
1794apic_polarity(int apic, int pin)
1795{
1796	int     x;
1797
1798	/* search each of the possible INTerrupt sources */
1799	for (x = 0; x < nintrs; ++x)
1800		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1801		    (pin == io_apic_ints[x].dst_apic_int))
1802			return (io_apic_ints[x].int_flags & 0x03);
1803
1804	return -1;		/* NOT found */
1805}
1806
1807
1808/*
1809 * set data according to MP defaults
1810 * FIXME: probably not complete yet...
1811 */
1812static void
1813default_mp_table(int type)
1814{
1815	int     ap_cpu_id;
1816#if defined(APIC_IO)
1817	int     io_apic_id;
1818	int     pin;
1819#endif	/* APIC_IO */
1820
1821#if 0
1822	printf("  MP default config type: %d\n", type);
1823	switch (type) {
1824	case 1:
1825		printf("   bus: ISA, APIC: 82489DX\n");
1826		break;
1827	case 2:
1828		printf("   bus: EISA, APIC: 82489DX\n");
1829		break;
1830	case 3:
1831		printf("   bus: EISA, APIC: 82489DX\n");
1832		break;
1833	case 4:
1834		printf("   bus: MCA, APIC: 82489DX\n");
1835		break;
1836	case 5:
1837		printf("   bus: ISA+PCI, APIC: Integrated\n");
1838		break;
1839	case 6:
1840		printf("   bus: EISA+PCI, APIC: Integrated\n");
1841		break;
1842	case 7:
1843		printf("   bus: MCA+PCI, APIC: Integrated\n");
1844		break;
1845	default:
1846		printf("   future type\n");
1847		break;
1848		/* NOTREACHED */
1849	}
1850#endif	/* 0 */
1851
1852	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
1853	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;
1854
1855	/* BSP */
1856	CPU_TO_ID(0) = boot_cpu_id;
1857	ID_TO_CPU(boot_cpu_id) = 0;
1858
1859	/* one and only AP */
1860	CPU_TO_ID(1) = ap_cpu_id;
1861	ID_TO_CPU(ap_cpu_id) = 1;
1862
1863#if defined(APIC_IO)
1864	/* one and only IO APIC */
1865	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;
1866
1867	/*
1868	 * sanity check, refer to MP spec section 3.6.6, last paragraph
1869	 * sanity check (refer to MP spec section 3.6.6, last paragraph);
1870	 * necessary as some hardware doesn't properly set up the IO APIC
1871#if defined(REALLY_ANAL_IOAPICID_VALUE)
1872	if (io_apic_id != 2) {
1873#else
1874	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
1875#endif	/* REALLY_ANAL_IOAPICID_VALUE */
1876		io_apic_set_id(0, 2);
1877		io_apic_id = 2;
1878	}
1879	IO_TO_ID(0) = io_apic_id;
1880	ID_TO_IO(io_apic_id) = 0;
1881#endif	/* APIC_IO */
1882
1883	/* fill out bus entries */
1884	switch (type) {
1885	case 1:
1886	case 2:
1887	case 3:
1888	case 4:
1889	case 5:
1890	case 6:
1891	case 7:
1892		bus_data[0].bus_id = default_data[type - 1][1];
1893		bus_data[0].bus_type = default_data[type - 1][2];
1894		bus_data[1].bus_id = default_data[type - 1][3];
1895		bus_data[1].bus_type = default_data[type - 1][4];
1896		break;
1897
1898	/* case 4: case 7:		   MCA NOT supported */
1899	default:		/* illegal/reserved */
1900		panic("BAD default MP config: %d", type);
1901		/* NOTREACHED */
1902	}
1903
1904#if defined(APIC_IO)
1905	/* general cases from MP v1.4, table 5-2 */
1906	for (pin = 0; pin < 16; ++pin) {
1907		io_apic_ints[pin].int_type = 0;
1908		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
1909		io_apic_ints[pin].src_bus_id = 0;
1910		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
1911		io_apic_ints[pin].dst_apic_id = io_apic_id;
1912		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
1913	}
1914
1915	/* special cases from MP v1.4, table 5-2 */
1916	if (type == 2) {
1917		io_apic_ints[2].int_type = 0xff;	/* N/C */
1918		io_apic_ints[13].int_type = 0xff;	/* N/C */
1919#if !defined(APIC_MIXED_MODE)
1920		/** FIXME: ??? */
1921		panic("sorry, can't support type 2 default yet");
1922#endif	/* APIC_MIXED_MODE */
1923	}
1924	else
1925		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */
1926
1927	if (type == 7)
1928		io_apic_ints[0].int_type = 0xff;	/* N/C */
1929	else
1930		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
1931#endif	/* APIC_IO */
1932}
1933
1934
1935/*
1936 * start each AP in our list
1937 */
1938static int
1939start_all_aps(u_int boot_addr)
1940{
1941	int     x, i, pg;
1942	u_char  mpbiosreason;
1943	u_long  mpbioswarmvec;
1944	struct globaldata *gd;
1945	char *stack;
1946	uintptr_t kptbase;
1947
1948	POSTCODE(START_ALL_APS_POST);
1949
1950	/* initialize BSP's local APIC */
1951	apic_initialize();
1952	bsp_apic_ready = 1;
1953
1954	/* install the AP 1st level boot code */
1955	install_ap_tramp(boot_addr);
1956
1957
1958	/* save the current value of the warm-start vector */
1959	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
1960#ifndef PC98
1961	outb(CMOS_REG, BIOS_RESET);
1962	mpbiosreason = inb(CMOS_DATA);
1963#endif
1964
1965	/* record BSP in CPU map */
1966	all_cpus = 1;
1967
1968	/* set up temporary P==V mapping for AP boot */
1969	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
1970	kptbase = (uintptr_t)(void *)KPTphys;
1971	for (x = 0; x < NKPT; x++)
1972		PTD[x] = (pd_entry_t)(PG_V | PG_RW |
1973		    ((kptbase + x * PAGE_SIZE) & PG_FRAME));
1974	invltlb();
1975
1976	/* start each AP */
1977	for (x = 1; x <= mp_naps; ++x) {
1978
1979		/* This is a bit verbose; it will go away soon.  */
1980
1981		/* first page of AP's private space */
1982		pg = x * i386_btop(sizeof(struct privatespace));
1983
1984		/* allocate a new private data page */
1985		gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);
1986
1987		/* wire it into the private page table page */
1988		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));
1989
1990		/* allocate and set up an idle stack data page */
1991		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
1992		for (i = 0; i < UPAGES; i++)
1993			SMPpt[pg + 1 + i] = (pt_entry_t)
1994			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
1995
1996		/* prime data page for it to use */
1997		SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu);
1998		gd->gd_cpuid = x;
1999
2000		/* setup a vector to our boot code */
2001		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
2002		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
2003#ifndef PC98
2004		outb(CMOS_REG, BIOS_RESET);
2005		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
2006#endif
2007
2008		bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE];
2009		bootAP = x;
2010
2011		/* attempt to start the Application Processor */
2012		CHECK_INIT(99);	/* setup checkpoints */
2013		if (!start_ap(x, boot_addr)) {
2014			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
2015			CHECK_PRINT("trace");	/* show checkpoints */
2016			/* better panic as the AP may be running loose */
2017			printf("panic y/n? [y] ");
2018			if (cngetc() != 'n')
2019				panic("bye-bye");
2020		}
2021		CHECK_PRINT("trace");		/* show checkpoints */
2022
2023		/* record its version info */
2024		cpu_apic_versions[x] = cpu_apic_versions[0];
2025
2026		all_cpus |= (1 << x);		/* record AP in CPU map */
2027	}
2028
2029	/* build our map of 'other' CPUs */
2030	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));
2031
2032	/* fill in our (BSP) APIC version */
2033	cpu_apic_versions[0] = lapic.version;
2034
2035	/* restore the warmstart vector */
2036	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
2037#ifndef PC98
2038	outb(CMOS_REG, BIOS_RESET);
2039	outb(CMOS_DATA, mpbiosreason);
2040#endif
2041
2042	/*
2043	 * Set up the idle context for the BSP.  Similar to above except
2044	 * that some of it was done by locore, some by pmap.c and some is
2045	 * implicit because the BSP is cpu#0 and the page is initially zero,
2046	 * and also because we can refer to variables by name on the BSP.
2047	 */
2048
2049	/* Allocate and setup BSP idle stack */
2050	stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
2051	for (i = 0; i < UPAGES; i++)
2052		SMPpt[1 + i] = (pt_entry_t)
2053		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
2054
2055	for (x = 0; x < NKPT; x++)
2056		PTD[x] = 0;
2057	pmap_set_opt();
2058
2059	/* number of APs actually started */
2060	return mp_ncpus - 1;
2061}
2062
2063
2064/*
2065 * load the 1st level AP boot code into base memory.
2066 */
2067
2068/* targets for relocation */
2069extern void bigJump(void);
2070extern void bootCodeSeg(void);
2071extern void bootDataSeg(void);
2072extern void MPentry(void);
2073extern u_int MP_GDT;
2074extern u_int mp_gdtbase;
2075
2076static void
2077install_ap_tramp(u_int boot_addr)
2078{
2079	int     x;
2080	int     size = *(int *) ((u_long) & bootMP_size);
2081	u_char *src = (u_char *) ((u_long) bootMP);
2082	u_char *dst = (u_char *) boot_addr + KERNBASE;
2083	u_int   boot_base = (u_int) bootMP;
2084	u_int8_t *dst8;
2085	u_int16_t *dst16;
2086	u_int32_t *dst32;
2087
2088	POSTCODE(INSTALL_AP_TRAMP_POST);
2089
2090	for (x = 0; x < size; ++x)
2091		*dst++ = *src++;
2092
2093	/*
2094	 * modify addresses in code we just moved to basemem. unfortunately we
2095	 * need fairly detailed info about mpboot.s for this to work.  changes
2096	 * to mpboot.s might require changes here.
2097	 */
2098
2099	/* boot code is located in KERNEL space */
2100	dst = (u_char *) boot_addr + KERNBASE;
2101
2102	/* modify the lgdt arg */
2103	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
2104	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);
2105
2106	/* modify the ljmp target for MPentry() */
2107	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
2108	*dst32 = ((u_int) MPentry - KERNBASE);
2109
2110	/* modify the target for boot code segment */
2111	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
2112	dst8 = (u_int8_t *) (dst16 + 1);
2113	*dst16 = (u_int) boot_addr & 0xffff;
2114	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
2115
2116	/* modify the target for boot data segment */
2117	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
2118	dst8 = (u_int8_t *) (dst16 + 1);
2119	*dst16 = (u_int) boot_addr & 0xffff;
2120	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
2121}
2122
2123
2124/*
2125 * This function starts the AP (application processor) identified by the
2126 * logical CPU number 'logical_cpu'.  It does quite a "song and dance"
2127 * to accomplish this.  This is necessary because of the nuances
2128 * of the different hardware we might encounter.  It ain't pretty,
2129 * but it seems to work.
2130 */
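/*
 * The sequence below is the "universal start-up algorithm" from the Intel
 * MP specification: an INIT IPI (assert), an INIT IPI (de-assert), a ~10ms
 * delay, then up to two STARTUP IPIs about 200us apart, each carrying the
 * trampoline's page number as its vector.
 */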
2131static int
2132start_ap(int logical_cpu, u_int boot_addr)
2133{
2134	int     physical_cpu;
2135	int     vector;
2136	int     cpus;
2137	u_long  icr_lo, icr_hi;
2138
2139	POSTCODE(START_AP_POST);
2140
2141	/* get the PHYSICAL APIC ID# */
2142	physical_cpu = CPU_TO_ID(logical_cpu);
2143
2144	/* calculate the vector */
2145	vector = (boot_addr >> 12) & 0xff;
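	/*
	 * The STARTUP IPI vector is the page number of the trampoline:
	 * the AP starts in real mode at CS:IP = (vector << 8):0000, so
	 * boot_addr must be page-aligned and below 1MB.
	 */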
2146
2147	/* used as a watchpoint to signal AP startup */
2148	cpus = mp_ncpus;
2149
2150	/*
2151	 * First we do an INIT/RESET IPI.  This INIT IPI might be run, resetting
2152	 * and running the target CPU; OR this INIT IPI might be latched (P5
2153	 * bug), leaving the CPU waiting for the STARTUP IPI; OR this INIT IPI
2154	 * might be ignored.
2155	 */
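	/*
	 * ICR low-dword values used below (local APIC ICR encoding):
	 *   0x0000c500  INIT delivery mode, level-triggered, level asserted
	 *   0x00008500  INIT delivery mode, level-triggered, level de-asserted
	 *   0x00000600  STARTUP delivery mode, with the vector in bits 0-7
	 */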
2156
2157	/* setup the address for the target AP */
2158	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
2159	icr_hi |= (physical_cpu << 24);
2160	lapic.icr_hi = icr_hi;
2161
2162	/* do an INIT IPI: assert RESET */
2163	icr_lo = lapic.icr_lo & 0xfff00000;
2164	lapic.icr_lo = icr_lo | 0x0000c500;
2165
2166	/* wait for pending status end */
2167	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2168		 /* spin */ ;
2169
2170	/* do an INIT IPI: deassert RESET */
2171	lapic.icr_lo = icr_lo | 0x00008500;
2172
2173	/* wait for pending status end */
2174	u_sleep(10000);		/* wait ~10 ms */
2175	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2176		 /* spin */ ;
2177
2178	/*
2179	 * Next we do a STARTUP IPI.  The previous INIT IPI might still be
2180	 * latched (P5 bug); this 1st STARTUP would then terminate
2181	 * immediately, and the previously started INIT IPI would continue.  OR
2182	 * the previous INIT IPI has already run, and this STARTUP IPI will
2183	 * run.  OR the previous INIT IPI was ignored, and this STARTUP IPI
2184	 * will run.
2185	 */
2186
2187	/* do a STARTUP IPI */
2188	lapic.icr_lo = icr_lo | 0x00000600 | vector;
2189	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2190		 /* spin */ ;
2191	u_sleep(200);		/* wait ~200 us */
2192
2193	/*
2194	 * Finally we do a 2nd STARTUP IPI.  This 2nd STARTUP IPI should run IF
2195	 * the previous STARTUP IPI was cancelled by a latched INIT IPI;
2196	 * otherwise this STARTUP IPI will be ignored, as only ONE STARTUP IPI
2197	 * is recognized after a hardware RESET or an INIT IPI.
2198	 */
2199
2200	lapic.icr_lo = icr_lo | 0x00000600 | vector;
2201	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2202		 /* spin */ ;
2203	u_sleep(200);		/* wait ~200 us */
2204
2205	/* wait for it to start */
2206	set_apic_timer(5000000);	/* == 5 seconds */
2207	while (read_apic_timer())
2208		if (mp_ncpus > cpus)
2209			return 1;	/* return SUCCESS */
2210
2211	return 0;		/* return FAILURE */
2212}
2213
2214/*
2215 * Flush the TLB on all other CPU's
2216 *
2217 * XXX: Needs to handshake and wait for completion before proceeding.
2218 */
2219void
2220smp_invltlb(void)
2221{
2222#if defined(APIC_IO)
2223	if (smp_started && invltlb_ok)
2224		all_but_self_ipi(XINVLTLB_OFFSET);
2225#endif  /* APIC_IO */
2226}
2227
2228void
2229invlpg(u_int addr)
2230{
2231	__asm   __volatile("invlpg (%0)"::"r"(addr):"memory");
2232
2233	/* send a message to the other CPUs */
2234	smp_invltlb();
2235}
2236
2237void
2238invltlb(void)
2239{
2240	u_long  temp;
2241
2242	/*
2243	 * This should be implemented as load_cr3(rcr3()) when load_cr3() is
2244	 * inlined.
2245	 */
2246	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
2247
2248	/* send a message to the other CPUs */
2249	smp_invltlb();
2250}
2251
2252
2253/*
2254 * This is called once the rest of the system is up and running and we're
2255 * ready to let the AP's out of the pen.
2256 */
2257void
2258ap_init(void)
2259{
2260	u_int	apic_id;
2261
2262	/* spin until all the AP's are ready */
2263	while (!aps_ready)
2264		/* spin */ ;
2265
2266	/*
2267	 * Set curproc to our per-cpu idleproc so that mutexes have
2268	 * something unique to lock with.
2269	 */
2270	PCPU_SET(curproc, PCPU_GET(idleproc));
2271
2272	/* lock against other AP's that are waking up */
2273	mtx_lock_spin(&ap_boot_mtx);
2274
2275	/* BSP may have changed PTD while we're waiting for the lock */
2276	cpu_invltlb();
2277
2278	smp_cpus++;
2279
2280#if defined(I586_CPU) && !defined(NO_F00F_HACK)
2281	lidt(&r_idt);
2282#endif
2283
2284	/* Build our map of 'other' CPUs. */
2285	PCPU_SET(other_cpus, all_cpus & ~(1 << PCPU_GET(cpuid)));
2286
2287	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
2288
2289	/* set up CPU registers and state */
2290	cpu_setregs();
2291
2292	/* set up FPU state on the AP */
2293	npxinit(__INITIAL_NPXCW__);
2294
2295	/* A quick check from sanity claus */
2296	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
2297	if (PCPU_GET(cpuid) != apic_id) {
2298		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
2299		printf("SMP: apic_id = %d\n", apic_id);
2300		printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
2301		panic("cpuid mismatch! boom!!");
2302	}
2303
2304	/* Init local apic for irq's */
2305	apic_initialize();
2306
2307	/* Set memory range attributes for this CPU to match the BSP */
2308	mem_range_AP_init();
2309
2310	/*
2311	 * Activate smp_invltlb, although strictly speaking, this isn't
2312	 * quite correct yet.  We should have a bitfield for cpus willing
2313	 * to accept TLB flush IPI's or something and sync them.
2314	 */
2315	if (smp_cpus == mp_ncpus) {
2316		invltlb_ok = 1;
2317		smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */
2318		smp_active = 1;	 /* historic */
2319	}
2320
2321	/* let other AP's wake up now */
2322	mtx_unlock_spin(&ap_boot_mtx);
2323
2324	/* wait until all the AP's are up */
2325	while (smp_started == 0)
2326		; /* nothing */
2327
2328	microuptime(PCPU_PTR(switchtime));
2329	PCPU_SET(switchticks, ticks);
2330
2331	/* ok, now grab sched_lock and enter the scheduler */
2332	enable_intr();
2333	mtx_lock_spin(&sched_lock);
2334	cpu_throw();	/* doesn't return */
2335
2336	panic("scheduler returned us to ap_init");
2337}
2338
2339#ifdef BETTER_CLOCK
2340
2341#define CHECKSTATE_USER	0
2342#define CHECKSTATE_SYS	1
2343#define CHECKSTATE_INTR	2
2344
2345/* Do not staticize.  Used from apic_vector.s */
2346struct proc*	checkstate_curproc[MAXCPU];
2347int		checkstate_cpustate[MAXCPU];
2348u_long		checkstate_pc[MAXCPU];
2349
2350#define PC_TO_INDEX(pc, prof)				\
2351        ((int)(((u_quad_t)((pc) - (prof)->pr_off) *	\
2352            (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
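/*
 * PC_TO_INDEX converts a sampled PC into an even (u_short-aligned) offset
 * into the process's profile buffer using the 16.16 fixed-point pr_scale,
 * the same computation the kernel's addupc_intr() uses.  For example, with
 * pr_off = 0x1000 and pr_scale = 0x10000 (a 1:1 scale), pc = 0x1234 yields
 * ((0x234 * 0x10000) >> 16) & ~1 = 0x234.
 */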
2353
2354static void
2355addupc_intr_forwarded(struct proc *p, int id, int *astmap)
2356{
2357	int i;
2358	struct uprof *prof;
2359	u_long pc;
2360
2361	pc = checkstate_pc[id];
2362	prof = &p->p_stats->p_prof;
2363	if (pc >= prof->pr_off &&
2364	    (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) {
2365		mtx_assert(&sched_lock, MA_OWNED);
2366		if ((p->p_sflag & PS_OWEUPC) == 0) {
2367			prof->pr_addr = pc;
2368			prof->pr_ticks = 1;
2369			p->p_sflag |= PS_OWEUPC;
2370		}
2371		*astmap |= (1 << id);
2372	}
2373}
2374
2375static void
2376forwarded_statclock(int id, int pscnt, int *astmap)
2377{
2378	struct pstats *pstats;
2379	long rss;
2380	struct rusage *ru;
2381	struct vmspace *vm;
2382	int cpustate;
2383	struct proc *p;
2384#ifdef GPROF
2385	register struct gmonparam *g;
2386	int i;
2387#endif
2388
2389	mtx_assert(&sched_lock, MA_OWNED);
2390	p = checkstate_curproc[id];
2391	cpustate = checkstate_cpustate[id];
2392
2393	/* XXX */
2394	if (p->p_ithd)
2395		cpustate = CHECKSTATE_INTR;
2396	else if (p == SMP_prvspace[id].globaldata.gd_idleproc)
2397		cpustate = CHECKSTATE_SYS;
2398
2399	switch (cpustate) {
2400	case CHECKSTATE_USER:
2401		if (p->p_sflag & PS_PROFIL)
2402			addupc_intr_forwarded(p, id, astmap);
2403		if (pscnt > 1)
2404			return;
2405		p->p_uticks++;
2406		if (p->p_nice > NZERO)
2407			cp_time[CP_NICE]++;
2408		else
2409			cp_time[CP_USER]++;
2410		break;
2411	case CHECKSTATE_SYS:
2412#ifdef GPROF
2413		/*
2414		 * Kernel statistics are just like addupc_intr, only easier.
2415		 */
2416		g = &_gmonparam;
2417		if (g->state == GMON_PROF_ON) {
2418			i = checkstate_pc[id] - g->lowpc;
2419			if (i < g->textsize) {
2420				i /= HISTFRACTION * sizeof(*g->kcount);
2421				g->kcount[i]++;
2422			}
2423		}
2424#endif
2425		if (pscnt > 1)
2426			return;
2427
2428		p->p_sticks++;
2429		if (p == SMP_prvspace[id].globaldata.gd_idleproc)
2430			cp_time[CP_IDLE]++;
2431		else
2432			cp_time[CP_SYS]++;
2433		break;
2434	case CHECKSTATE_INTR:
2435	default:
2436#ifdef GPROF
2437		/*
2438		 * Kernel statistics are just like addupc_intr, only easier.
2439		 */
2440		g = &_gmonparam;
2441		if (g->state == GMON_PROF_ON) {
2442			i = checkstate_pc[id] - g->lowpc;
2443			if (i < g->textsize) {
2444				i /= HISTFRACTION * sizeof(*g->kcount);
2445				g->kcount[i]++;
2446			}
2447		}
2448#endif
2449		if (pscnt > 1)
2450			return;
2451		KASSERT(p != NULL, ("NULL process in interrupt state"));
2452		p->p_iticks++;
2453		cp_time[CP_INTR]++;
2454	}
2455
2456	schedclock(p);
2457
2458	/* Update resource usage integrals and maximums. */
2459	if ((pstats = p->p_stats) != NULL &&
2460	    (ru = &pstats->p_ru) != NULL &&
2461	    (vm = p->p_vmspace) != NULL) {
2462		ru->ru_ixrss += pgtok(vm->vm_tsize);
2463		ru->ru_idrss += pgtok(vm->vm_dsize);
2464		ru->ru_isrss += pgtok(vm->vm_ssize);
2465		rss = pgtok(vmspace_resident_count(vm));
2466		if (ru->ru_maxrss < rss)
2467			ru->ru_maxrss = rss;
2468	}
2469}
2470
2471void
2472forward_statclock(int pscnt)
2473{
2474	int map;
2475	int id;
2476	int i;
2477
2478	/* Kludge. We don't yet have separate locks for the interrupts
2479	 * and the kernel. This means that we cannot let the other processors
2480	 * handle complex interrupts while inhibiting them from entering
2481	 * the kernel in a non-interrupt context.
2482	 *
2483	 * What we can do, without changing the locking mechanisms yet,
2484	 * is to let the other processors handle a very simple interrupt
2485	 * (which determines the processor states) and to do the main
2486	 * work ourselves.
2487	 */
2488
2489	CTR1(KTR_SMP, "forward_statclock(%d)", pscnt);
2490
2491	if (!smp_started || !invltlb_ok || cold || panicstr)
2492		return;
2493
2494	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle ) */
2495
2496	map = PCPU_GET(other_cpus) & ~stopped_cpus;
2497	checkstate_probed_cpus = 0;
2498	if (map != 0)
2499		selected_apic_ipi(map,
2500				  XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2501
2502	i = 0;
2503	while (checkstate_probed_cpus != map) {
2504		/* spin */
2505		i++;
2506		if (i == 100000) {
2507#ifdef BETTER_CLOCK_DIAGNOSTIC
2508			printf("forward_statclock: checkstate %x\n",
2509			       checkstate_probed_cpus);
2510#endif
2511			break;
2512		}
2513	}
2514
2515	/*
2516	 * Step 2: walk through other processors' processes, update ticks and
2517	 * profiling info.
2518	 */
2519
2520	map = 0;
2521	for (id = 0; id < mp_ncpus; id++) {
2522		if (id == PCPU_GET(cpuid))
2523			continue;
2524		if (((1 << id) & checkstate_probed_cpus) == 0)
2525			continue;
2526		forwarded_statclock(id, pscnt, &map);
2527	}
2528	if (map != 0) {
2529		checkstate_need_ast |= map;
2530		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2531		i = 0;
2532		while ((checkstate_need_ast & map) != 0) {
2533			/* spin */
2534			i++;
2535			if (i > 100000) {
2536#ifdef BETTER_CLOCK_DIAGNOSTIC
2537				printf("forward_statclock: dropped ast 0x%x\n",
2538				       checkstate_need_ast & map);
2539#endif
2540				break;
2541			}
2542		}
2543	}
2544}
2545
2546void
2547forward_hardclock(int pscnt)
2548{
2549	int map;
2550	int id;
2551	struct proc *p;
2552	struct pstats *pstats;
2553	int i;
2554
2555	/* Kludge. We don't yet have separate locks for the interrupts
2556	 * and the kernel. This means that we cannot let the other processors
2557	 * handle complex interrupts while inhibiting them from entering
2558	 * the kernel in a non-interrupt context.
2559	 *
2560	 * What we can do, without changing the locking mechanisms yet,
2561	 * is to let the other processors handle a very simple interrupt
2562	 * (which determines the processor states) and to do the main
2563	 * work ourselves.
2564	 */
2565
2566	CTR1(KTR_SMP, "forward_hardclock(%d)", pscnt);
2567
2568	if (!smp_started || !invltlb_ok || cold || panicstr)
2569		return;
2570
2571	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle) */
2572
2573	map = PCPU_GET(other_cpus) & ~stopped_cpus;
2574	checkstate_probed_cpus = 0;
2575	if (map != 0)
2576		selected_apic_ipi(map,
2577				  XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2578
2579	i = 0;
2580	while (checkstate_probed_cpus != map) {
2581		/* spin */
2582		i++;
2583		if (i == 100000) {
2584#ifdef BETTER_CLOCK_DIAGNOSTIC
2585			printf("forward_hardclock: checkstate %x\n",
2586			       checkstate_probed_cpus);
2587#endif
2588			break;
2589		}
2590	}
2591
2592	/*
2593	 * Step 2: walk through other processors' processes, update virtual
2594	 * timer and profiling timer. If stathz == 0, also update ticks and
2595	 * profiling info.
2596	 */
2597
2598	map = 0;
2599	for (id = 0; id < mp_ncpus; id++) {
2600		if (id == PCPU_GET(cpuid))
2601			continue;
2602		if (((1 << id) & checkstate_probed_cpus) == 0)
2603			continue;
2604		p = checkstate_curproc[id];
2605		if (p) {
2606			pstats = p->p_stats;
2607			if (checkstate_cpustate[id] == CHECKSTATE_USER &&
2608			    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
2609			    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
2610				p->p_sflag |= PS_ALRMPEND;
2611				map |= (1 << id);
2612			}
2613			if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
2614			    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
2615				p->p_sflag |= PS_PROFPEND;
2616				map |= (1 << id);
2617			}
2618		}
2619		if (stathz == 0) {
2620			forwarded_statclock(id, pscnt, &map);
2621		}
2622	}
2623	if (map != 0) {
2624		checkstate_need_ast |= map;
2625		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2626		i = 0;
2627		while ((checkstate_need_ast & map) != 0) {
2628			/* spin */
2629			i++;
2630			if (i > 100000) {
2631#ifdef BETTER_CLOCK_DIAGNOSTIC
2632				printf("forward_hardclock: dropped ast 0x%x\n",
2633				       checkstate_need_ast & map);
2634#endif
2635				break;
2636			}
2637		}
2638	}
2639}
2640
2641#endif /* BETTER_CLOCK */
2642
2643void
2644forward_signal(struct proc *p)
2645{
2646	int map;
2647	int id;
2648	int i;
2649
2650	/* Kludge. We don't yet have separate locks for the interrupts
2651	 * and the kernel. This means that we cannot let the other processors
2652	 * handle complex interrupts while inhibiting them from entering
2653	 * the kernel in a non-interrupt context.
2654	 *
2655	 * What we can do, without changing the locking mechanisms yet,
2656	 * is to let the other processors handle a very simple interrupt
2657	 * (which determines the processor states) and to do the main
2658	 * work ourselves.
2659	 */
2660
2661	CTR1(KTR_SMP, "forward_signal(%p)", p);
2662
2663	if (!smp_started || !invltlb_ok || cold || panicstr)
2664		return;
2665	if (!forward_signal_enabled)
2666		return;
2667	mtx_lock_spin(&sched_lock);
2668	while (1) {
2669		if (p->p_stat != SRUN) {
2670			mtx_unlock_spin(&sched_lock);
2671			return;
2672		}
2673		id = p->p_oncpu;
2674		mtx_unlock_spin(&sched_lock);
2675		if (id == 0xff)
2676			return;
2677		map = (1<<id);
2678		checkstate_need_ast |= map;
2679		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2680		i = 0;
2681		while ((checkstate_need_ast & map) != 0) {
2682			/* spin */
2683			i++;
2684			if (i > 100000) {
2685#if 0
2686				printf("forward_signal: dropped ast 0x%x\n",
2687				       checkstate_need_ast & map);
2688#endif
2689				break;
2690			}
2691		}
2692		mtx_lock_spin(&sched_lock);
2693		if (id == p->p_oncpu) {
2694			mtx_unlock_spin(&sched_lock);
2695			return;
2696		}
2697	}
2698}
2699
2700void
2701forward_roundrobin(void)
2702{
2703	u_int map;
2704	int i;
2705
2706	CTR0(KTR_SMP, "forward_roundrobin()");
2707
2708	if (!smp_started || !invltlb_ok || cold || panicstr)
2709		return;
2710	if (!forward_roundrobin_enabled)
2711		return;
2712	resched_cpus |= PCPU_GET(other_cpus);
2713	map = PCPU_GET(other_cpus) & ~stopped_cpus;
2714#if 1
2715	selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2716#else
2717	(void) all_but_self_ipi(XCPUAST_OFFSET);
2718#endif
2719	i = 0;
2720	while ((checkstate_need_ast & map) != 0) {
2721		/* spin */
2722		i++;
2723		if (i > 100000) {
2724#if 0
2725			printf("forward_roundrobin: dropped ast 0x%x\n",
2726			       checkstate_need_ast & map);
2727#endif
2728			break;
2729		}
2730	}
2731}
2732
2733/*
2734 * When called the executing CPU will send an IPI to all other CPUs
2735 *  requesting that they halt execution.
2736 *
2737 * Usually (but not necessarily) called with 'other_cpus' as its arg.
2738 *
2739 *  - Signals all CPUs in map to stop.
2740 *  - Waits for each to stop.
2741 *
2742 * Returns:
2743 *  -1: error
2744 *   0: NA
2745 *   1: ok
2746 *
2747 * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs
2748 *            from executing at the same time.
2749 */
2750int
2751stop_cpus(u_int map)
2752{
2753	int count = 0;
2754
2755	if (!smp_started)
2756		return 0;
2757
2758	/* send the Xcpustop IPI to all CPUs in map */
2759	selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED);
2760
2761	while (count++ < 100000 && (stopped_cpus & map) != map)
2762		/* spin */ ;
2763
2764#ifdef DIAGNOSTIC
2765	if ((stopped_cpus & map) != map)
2766		printf("Warning: CPUs 0x%x did not stop!\n",
2767		    (~(stopped_cpus & map)) & map);
2768#endif
2769
2770	return 1;
2771}
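/*
 * A minimal sketch of the serialization the XXX FIXME above asks for
 * (hypothetical, not implemented here): guard the stop/restart pair with
 * a dedicated spin mutex so that only one CPU runs them at a time:
 *
 *	mtx_lock_spin(&cpustop_mtx);	(cpustop_mtx is hypothetical)
 *	... send the Xcpustop IPI and spin, as above ...
 *	mtx_unlock_spin(&cpustop_mtx);
 */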
2772
2773
2774/*
2775 * Called by a CPU to restart stopped CPUs.
2776 *
2777 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
2778 *
2779 *  - Signals all CPUs in map to restart.
2780 *  - Waits for each to restart.
2781 *
2782 * Returns:
2783 *  -1: error
2784 *   0: NA
2785 *   1: ok
2786 */
2787int
2788restart_cpus(u_int map)
2789{
2790	int count = 0;
2791
2792	if (!smp_started)
2793		return 0;
2794
2795	started_cpus = map;		/* signal other cpus to restart */
2796
2797	/* wait for each to clear its bit */
2798	while (count++ < 100000 && (stopped_cpus & map) != 0)
2799		/* spin */ ;
2800
2801#ifdef DIAGNOSTIC
2802	if ((stopped_cpus & map) != 0)
2803		printf("Warning: CPUs 0x%x did not restart!\n",
2804		    stopped_cpus & map);
2805#endif
2806
2807	return 1;
2808}
2809
2810
2811#ifdef APIC_INTR_REORDER
2812/*
2813 *	Maintain mapping from softintr vector to isr bit in local apic.
2814 */
2815void
2816set_lapic_isrloc(int intr, int vector)
2817{
2818	if (intr < 0 || intr > 32)
2819		panic("set_lapic_isrloc: bad intr argument: %d", intr);
2820	if (vector < ICU_OFFSET || vector > 255)
2821		panic("set_lapic_isrloc: bad vector argument: %d", vector);
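	/*
	 * The local APIC keeps in-service (ISR) state in eight 32-bit
	 * registers spaced 16 bytes apart; vector>>5 selects the register
	 * and vector&31 the bit within it.  The <<2 below assumes
	 * lapic.isr0 is a 4-byte field, so that the pointer arithmetic
	 * steps 16 bytes per register.
	 */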
2822	apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2);
2823	apic_isrbit_location[intr].bit = (1<<(vector & 31));
2824}
2825#endif
2826
2827/*
2828 * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
2829 * (if specified), rendezvous, execute the action function (if specified),
2830 * rendezvous again, execute the teardown function (if specified), and then
2831 * resume.
2832 *
2833 * Note that the supplied external functions _must_ be reentrant and aware
2834 * that they are running in parallel and in an unknown lock context.
2835 */
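/*
 * Example (hypothetical caller): to run do_flush(arg) on every CPU with no
 * setup or teardown work:
 *
 *	smp_rendezvous(NULL, do_flush, NULL, arg);
 *
 * Every CPU, including the caller, executes do_flush() between the entry
 * and exit barriers in smp_rendezvous_action().
 */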
2836static void (*smp_rv_setup_func)(void *arg);
2837static void (*smp_rv_action_func)(void *arg);
2838static void (*smp_rv_teardown_func)(void *arg);
2839static void *smp_rv_func_arg;
2840static volatile int smp_rv_waiters[2];
2841
2842void
2843smp_rendezvous_action(void)
2844{
2845	/* setup function */
2846	if (smp_rv_setup_func != NULL)
2847		smp_rv_setup_func(smp_rv_func_arg);
2848	/* spin on entry rendezvous */
2849	atomic_add_int(&smp_rv_waiters[0], 1);
2850	while (smp_rv_waiters[0] < mp_ncpus)
2851		;
2852	/* action function */
2853	if (smp_rv_action_func != NULL)
2854		smp_rv_action_func(smp_rv_func_arg);
2855	/* spin on exit rendezvous */
2856	atomic_add_int(&smp_rv_waiters[1], 1);
2857	while (smp_rv_waiters[1] < mp_ncpus)
2858		;
2859	/* teardown function */
2860	if (smp_rv_teardown_func != NULL)
2861		smp_rv_teardown_func(smp_rv_func_arg);
2862}
2863
2864void
2865smp_rendezvous(void (* setup_func)(void *),
2866	       void (* action_func)(void *),
2867	       void (* teardown_func)(void *),
2868	       void *arg)
2869{
2870
2871	/* obtain rendezvous lock */
2872	mtx_lock_spin(&smp_rv_mtx);
2873
2874	/* set static function pointers */
2875	smp_rv_setup_func = setup_func;
2876	smp_rv_action_func = action_func;
2877	smp_rv_teardown_func = teardown_func;
2878	smp_rv_func_arg = arg;
2879	smp_rv_waiters[0] = 0;
2880	smp_rv_waiters[1] = 0;
2881
2882	/*
2883	 * signal other processors, which will enter the IPI with interrupts off
2884	 */
2885	all_but_self_ipi(XRENDEZVOUS_OFFSET);
2886
2887	/* call executor function */
2888	smp_rendezvous_action();
2889
2890	/* release lock */
2891	mtx_unlock_spin(&smp_rv_mtx);
2892}
2893
2894void
2895release_aps(void *dummy __unused)
2896{
2897	atomic_store_rel_int(&aps_ready, 1);
2898}
2899
2900SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
2901