mptable.c revision 47081
1/*
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 *	$Id: mp_machdep.c,v 1.100 1999/05/08 17:48:22 peter Exp $
26 */
27
28#include "opt_smp.h"
29#include "opt_vm86.h"
30#include "opt_cpu.h"
31#include "opt_user_ldt.h"
32
33#ifdef SMP
34#include <machine/smptests.h>
35#else
36#error
37#endif
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/kernel.h>
42#include <sys/proc.h>
43#include <sys/sysctl.h>
44#include <sys/malloc.h>
45#include <sys/memrange.h>
46#ifdef BETTER_CLOCK
47#include <sys/dkstat.h>
48#endif
49
50#include <vm/vm.h>
51#include <vm/vm_param.h>
52#include <vm/pmap.h>
53#include <vm/vm_kern.h>
54#include <vm/vm_extern.h>
55#ifdef BETTER_CLOCK
56#include <sys/lock.h>
57#include <vm/vm_map.h>
58#include <sys/user.h>
59#ifdef GPROF
60#include <sys/gmon.h>
61#endif
62#endif
63
64#include <machine/smp.h>
65#include <machine/apic.h>
66#include <machine/mpapic.h>
67#include <machine/segments.h>
68#include <machine/smptests.h>	/** TEST_DEFAULT_CONFIG, TEST_TEST1 */
69#include <machine/tss.h>
70#include <machine/specialreg.h>
71#include <machine/cputypes.h>
72#include <machine/globaldata.h>
73
74#include <i386/i386/cons.h>	/* cngetc() */
75
76#if defined(APIC_IO)
77#include <machine/md_var.h>		/* setidt() */
78#include <i386/isa/icu.h>		/* IPIs */
79#include <i386/isa/intr_machdep.h>	/* IPIs */
80#endif	/* APIC_IO */
81
/*
 * Source of the MP feature byte 1 value that selects a 'default'
 * configuration: the compile-time TEST_DEFAULT_CONFIG override when it is
 * defined, otherwise the byte found in the floating pointer structure.
 */
#if !defined(TEST_DEFAULT_CONFIG)
#define MPFPS_MPFB1	mpfps->mpfb1
#else
#define MPFPS_MPFB1	TEST_DEFAULT_CONFIG
#endif  /* !TEST_DEFAULT_CONFIG */

/*
 * NOTE(review): presumably the BIOS warm boot vector (offset and segment
 * words), addressed through the kernel mapping -- confirm against the AP
 * startup code, which is not part of this section.
 */
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

/* BIOS region scanned by mp_probe() for the MP signature */
#if !defined(PC98)
#define BIOS_BASE		(0xf0000)
#define BIOS_SIZE		(0x10000)
#else
#define BIOS_BASE		(0xe8000)
#define BIOS_SIZE		(0x18000)
#endif
#define BIOS_COUNT		(BIOS_SIZE/4)	/* region size in 32-bit words */

/* CMOS index and data ports (used by the CHECK_POINTS macros) */
#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* bits tested in the cpu_flags / apic_flags bytes of MP table entries */
#define PROCENTRY_FLAG_EN	0x01
#define PROCENTRY_FLAG_BP	0x02
#define IOAPICENTRY_FLAG_EN	0x01
109
110
111/* MP Floating Pointer Structure */
112typedef struct MPFPS {
113	char    signature[4];
114	void   *pap;
115	u_char  length;
116	u_char  spec_rev;
117	u_char  checksum;
118	u_char  mpfb1;
119	u_char  mpfb2;
120	u_char  mpfb3;
121	u_char  mpfb4;
122	u_char  mpfb5;
123}      *mpfps_t;
124
125/* MP Configuration Table Header */
126typedef struct MPCTH {
127	char    signature[4];
128	u_short base_table_length;
129	u_char  spec_rev;
130	u_char  checksum;
131	u_char  oem_id[8];
132	u_char  product_id[12];
133	void   *oem_table_pointer;
134	u_short oem_table_size;
135	u_short entry_count;
136	void   *apic_address;
137	u_short extended_table_length;
138	u_char  extended_table_checksum;
139	u_char  reserved;
140}      *mpcth_t;
141
142
143typedef struct PROCENTRY {
144	u_char  type;
145	u_char  apic_id;
146	u_char  apic_version;
147	u_char  cpu_flags;
148	u_long  cpu_signature;
149	u_long  feature_flags;
150	u_long  reserved1;
151	u_long  reserved2;
152}      *proc_entry_ptr;
153
154typedef struct BUSENTRY {
155	u_char  type;
156	u_char  bus_id;
157	char    bus_type[6];
158}      *bus_entry_ptr;
159
160typedef struct IOAPICENTRY {
161	u_char  type;
162	u_char  apic_id;
163	u_char  apic_version;
164	u_char  apic_flags;
165	void   *apic_address;
166}      *io_apic_entry_ptr;
167
168typedef struct INTENTRY {
169	u_char  type;
170	u_char  int_type;
171	u_short int_flags;
172	u_char  src_bus_id;
173	u_char  src_bus_irq;
174	u_char  dst_apic_id;
175	u_char  dst_apic_int;
176}      *int_entry_ptr;
177
178/* descriptions of MP basetable entries */
179typedef struct BASETABLE_ENTRY {
180	u_char  type;
181	u_char  length;
182	char    name[16];
183}       basetable_entry;
184
/*
 * Optional AP boot checkpoints.
 *
 * This code MUST be enabled both here and in mpboot.s.
 * It follows the very early stages of AP boot by placing values in CMOS ram.
 * It NORMALLY will never be needed, hence the primitive method for enabling
 * it: move the define below out of this comment.
 *
#define CHECK_POINTS
 */

#if defined(CHECK_POINTS) && !defined(PC98)
/* read, or write, one CMOS location through the index/data port pair */
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

/*
 * Store (D) in all six checkpoint locations.
 * Wrapped in do/while (0) so that the expansion is a single statement and
 * stays correct under an unbraced if/else; the caller supplies the ';'.
 */
#define CHECK_INIT(D)				\
	do {					\
		CHECK_WRITE(0x34, (D));		\
		CHECK_WRITE(0x35, (D));		\
		CHECK_WRITE(0x36, (D));		\
		CHECK_WRITE(0x37, (D));		\
		CHECK_WRITE(0x38, (D));		\
		CHECK_WRITE(0x39, (D));		\
	} while (0)

/* print the six checkpoint locations, prefixed by the string (S) */
#define CHECK_PRINT(S)				\
	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
	   (S),					\
	   CHECK_READ(0x34),			\
	   CHECK_READ(0x35),			\
	   CHECK_READ(0x36),			\
	   CHECK_READ(0x37),			\
	   CHECK_READ(0x38),			\
	   CHECK_READ(0x39))

#else				/* CHECK_POINTS */

/* checkpoints disabled: both macros expand to nothing */
#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */
221
/*
 * Progress codes handed to POSTCODE(), one per major SMP startup step.
 */

/* posted on entry to mp_bootaddress(), mp_probe() and mptable_pass1() */
#define MP_BOOTADDRESS_POST	0x10
#define MP_PROBE_POST		0x11
#define MPTABLE_PASS1_POST	0x12

/* posted on entry to mp_start(), mp_enable() and mptable_pass2() */
#define MP_START_POST		0x13
#define MP_ENABLE_POST		0x14
#define MPTABLE_PASS2_POST	0x15

/* AP startup steps */
#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

/* posted on entry to mp_announce() */
#define MP_ANNOUNCE_POST	0x19
238
239
240/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
241int	current_postcode;
242
243/** XXX FIXME: what system files declare these??? */
244extern struct region_descriptor r_gdt, r_idt;
245
246int	bsp_apic_ready = 0;	/* flags useability of BSP apic */
247int	mp_ncpus;		/* # of CPUs, including BSP */
248int	mp_naps;		/* # of Applications processors */
249int	mp_nbusses;		/* # of busses */
250int	mp_napics;		/* # of IO APICs */
251int	boot_cpu_id;		/* designated BSP */
252vm_offset_t cpu_apic_address;
253vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
254extern	int nkpt;
255
256u_int32_t cpu_apic_versions[NCPU];
257u_int32_t io_apic_versions[NAPIC];
258
#ifdef APIC_INTR_DIAGNOSTIC
/*
 * Interrupt path trace counters, 32 slots per traced event.
 * NOTE(review): presumably indexed by interrupt number -- the code that
 * updates them is outside this section, confirm there.
 */
int apic_itrace_enter[32];
int apic_itrace_tryisrlock[32];
int apic_itrace_gotisrlock[32];
int apic_itrace_active[32];
int apic_itrace_masked[32];
int apic_itrace_noisrlock[32];
int apic_itrace_masked2[32];
int apic_itrace_unmask[32];
int apic_itrace_noforward[32];
int apic_itrace_leave[32];
int apic_itrace_enter2[32];
int apic_itrace_doreti[32];
int apic_itrace_splz[32];
int apic_itrace_eoi[32];
#ifdef APIC_INTR_DIAGNOSTIC_IRQ
/* trace buffer, its current index and the lock guarding both */
unsigned short apic_itrace_debugbuffer[32768];
int apic_itrace_debugbuffer_idx;
struct simplelock apic_itrace_debuglock;
#endif	/* APIC_INTR_DIAGNOSTIC_IRQ */
#endif	/* APIC_INTR_DIAGNOSTIC */

#ifdef APIC_INTR_REORDER
/* a memory location and a bit within it, 32 slots */
struct {
	volatile int *location;
	int bit;
} apic_isrbit_location[32];
#endif	/* APIC_INTR_REORDER */
287
288struct apic_intmapinfo	int_to_apicintpin[APIC_INTMAPSIZE];
289
290/*
291 * APIC ID logical/physical mapping structures.
292 * We oversize these to simplify boot-time config.
293 */
294int     cpu_num_to_apic_id[NAPICID];
295int     io_num_to_apic_id[NAPICID];
296int     apic_id_to_logical[NAPICID];
297
298
299/* Bitmap of all available CPUs */
300u_int	all_cpus;
301
302/* AP uses this during bootstrap.  Do not staticize.  */
303char *bootSTK;
304int boot_cpuid;
305
306/* Hotwire a 0->4MB V==P mapping */
307extern pt_entry_t *KPTphys;
308
309/* SMP page table page */
310extern pt_entry_t *SMPpt;
311
312struct pcb stoppcbs[NCPU];
313
314int smp_started;		/* has the system started? */
315
316/*
317 * Local data and functions.
318 */
319
320static int	mp_capable;
321static u_int	boot_address;
322static u_int	base_memory;
323
324static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
325static mpfps_t	mpfps;
326static int	search_for_sig(u_int32_t target, int count);
327static void	mp_enable(u_int boot_addr);
328
329static int	mptable_pass1(void);
330static int	mptable_pass2(void);
331static void	default_mp_table(int type);
332static void	fix_mp_table(void);
333static void	setup_apic_irq_mapping(void);
334static void	init_locks(void);
335static int	start_all_aps(u_int boot_addr);
336static void	install_ap_tramp(u_int boot_addr);
337static int	start_ap(int logicalCpu, u_int boot_addr);
338
339/*
340 * Calculate usable address in base memory for AP trampoline code.
341 */
342u_int
343mp_bootaddress(u_int basemem)
344{
345	POSTCODE(MP_BOOTADDRESS_POST);
346
347	base_memory = basemem * 1024;	/* convert to bytes */
348
349	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
350	if ((base_memory - boot_address) < bootMP_size)
351		boot_address -= 4096;	/* not enough, lower by 4k */
352
353	return boot_address;
354}
355
356
357/*
358 * Look for an Intel MP spec table (ie, SMP capable hardware).
359 */
360int
361mp_probe(void)
362{
363	int     x;
364	u_long  segment;
365	u_int32_t target;
366
367	POSTCODE(MP_PROBE_POST);
368
369	/* see if EBDA exists */
370	if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) {
371		/* search first 1K of EBDA */
372		target = (u_int32_t) (segment << 4);
373		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
374			goto found;
375	} else {
376		/* last 1K of base memory, effective 'top of base' passed in */
377		target = (u_int32_t) (base_memory - 0x400);
378		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
379			goto found;
380	}
381
382	/* search the BIOS */
383	target = (u_int32_t) BIOS_BASE;
384	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
385		goto found;
386
387	/* nothing found */
388	mpfps = (mpfps_t)0;
389	mp_capable = 0;
390	return 0;
391
392found:
393	/* calculate needed resources */
394	mpfps = (mpfps_t)x;
395	if (mptable_pass1())
396		panic("you must reconfigure your kernel");
397
398	/* flag fact that we are running multiple processors */
399	mp_capable = 1;
400	return 1;
401}
402
403
404/*
405 * Startup the SMP processors.
406 */
407void
408mp_start(void)
409{
410	POSTCODE(MP_START_POST);
411
412	/* look for MP capable motherboard */
413	if (mp_capable)
414		mp_enable(boot_address);
415	else
416		panic("MP hardware not found!");
417}
418
419
420/*
421 * Print various information about the SMP system hardware and setup.
422 */
423void
424mp_announce(void)
425{
426	int     x;
427
428	POSTCODE(MP_ANNOUNCE_POST);
429
430	printf("FreeBSD/SMP: Multiprocessor motherboard\n");
431	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
432	printf(", version: 0x%08x", cpu_apic_versions[0]);
433	printf(", at 0x%08x\n", cpu_apic_address);
434	for (x = 1; x <= mp_naps; ++x) {
435		printf(" cpu%d (AP):  apic id: %2d", x, CPU_TO_ID(x));
436		printf(", version: 0x%08x", cpu_apic_versions[x]);
437		printf(", at 0x%08x\n", cpu_apic_address);
438	}
439
440#if defined(APIC_IO)
441	for (x = 0; x < mp_napics; ++x) {
442		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
443		printf(", version: 0x%08x", io_apic_versions[x]);
444		printf(", at 0x%08x\n", io_apic_address[x]);
445	}
446#else
447	printf(" Warning: APIC I/O disabled\n");
448#endif	/* APIC_IO */
449}
450
451/*
452 * AP cpu's call this to sync up protected mode.
453 */
454void
455init_secondary(void)
456{
457	int	gsel_tss;
458	int	x, myid = boot_cpuid;
459
460	gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[myid];
461	gdt_segs[GPROC0_SEL].ssd_base =
462		(int) &SMP_prvspace[myid].globaldata.gd_common_tss;
463	SMP_prvspace[myid].globaldata.gd_prvspace = &SMP_prvspace[myid];
464
465	for (x = 0; x < NGDT; x++) {
466		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
467	}
468
469	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
470	r_gdt.rd_base = (int) &gdt[myid * NGDT];
471	lgdt(&r_gdt);			/* does magic intra-segment return */
472
473	lidt(&r_idt);
474
475	lldt(_default_ldt);
476#ifdef USER_LDT
477	currentldt = _default_ldt;
478#endif
479
480	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
481	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
482	common_tss.tss_esp0 = 0;	/* not used until after switch */
483	common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
484	common_tss.tss_ioopt = (sizeof common_tss) << 16;
485#ifdef VM86
486	tss_gdt = &gdt[myid * NGDT + GPROC0_SEL].sd;
487	common_tssd = *tss_gdt;
488#endif
489	ltr(gsel_tss);
490
491	load_cr0(0x8005003b);		/* XXX! */
492
493	pmap_set_opt((unsigned *)PTD);
494
495	invltlb();
496}
497
498
#if defined(APIC_IO)
/*
 * Finish configuring the local APIC of the BSP:
 *  - leave 'pic mode' when the MP table said it was in use.
 *  - mask lint0, which ends 'virtual wire mode'.
 *  - unmask lint1 so that NMI is delivered.
 */
void
bsp_apic_configure(void)
{
	u_char		imcr;
	u_int32_t	lvt;

	/* 'pic mode' is only left when it was in use */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		imcr = inb(0x23);	/* current contents */
		imcr |= 0x01;		/* mask external INTR */
		outb(0x23, imcr);	/* disconnect 8259s/NMI */
	}

	/* lint0 is the 8259 'virtual wire' connection: mask it */
	lvt = lapic.lvt_lint0;
	lvt |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = lvt;

	/* lint1 handles NMI: unmask it */
	lvt = lapic.lvt_lint1;
	lvt &= ~APIC_LVT_M;		/* clear the mask */
	lapic.lvt_lint1 = lvt;

	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
#endif  /* APIC_IO */
534
535
536/*******************************************************************
537 * local functions and data
538 */
539
540/*
541 * start the SMP system
542 */
543static void
544mp_enable(u_int boot_addr)
545{
546	int     x;
547#if defined(APIC_IO)
548	int     apic;
549	u_int   ux;
550#endif	/* APIC_IO */
551
552	POSTCODE(MP_ENABLE_POST);
553
554	/* turn on 4MB of V == P addressing so we can get to MP table */
555	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
556	invltlb();
557
558	/* examine the MP table for needed info, uses physical addresses */
559	x = mptable_pass2();
560
561	*(int *)PTD = 0;
562	invltlb();
563
564	/* can't process default configs till the CPU APIC is pmapped */
565	if (x)
566		default_mp_table(x);
567
568	/* post scan cleanup */
569	fix_mp_table();
570	setup_apic_irq_mapping();
571
572#if defined(APIC_IO)
573
574	/* fill the LOGICAL io_apic_versions table */
575	for (apic = 0; apic < mp_napics; ++apic) {
576		ux = io_apic_read(apic, IOAPIC_VER);
577		io_apic_versions[apic] = ux;
578	}
579
580	/* program each IO APIC in the system */
581	for (apic = 0; apic < mp_napics; ++apic)
582		if (io_apic_setup(apic) < 0)
583			panic("IO APIC setup failure");
584
585	/* install a 'Spurious INTerrupt' vector */
586	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
587	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
588
589	/* install an inter-CPU IPI for TLB invalidation */
590	setidt(XINVLTLB_OFFSET, Xinvltlb,
591	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
592
593#ifdef BETTER_CLOCK
594	/* install an inter-CPU IPI for reading processor state */
595	setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate,
596	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
597#endif
598
599	/* install an inter-CPU IPI for forcing an additional software trap */
600	setidt(XCPUAST_OFFSET, Xcpuast,
601	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
602
603	/* install an inter-CPU IPI for interrupt forwarding */
604	setidt(XFORWARD_IRQ_OFFSET, Xforward_irq,
605	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
606
607	/* install an inter-CPU IPI for CPU stop/restart */
608	setidt(XCPUSTOP_OFFSET, Xcpustop,
609	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
610
611#if defined(TEST_TEST1)
612	/* install a "fake hardware INTerrupt" vector */
613	setidt(XTEST1_OFFSET, Xtest1,
614	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
615#endif  /** TEST_TEST1 */
616
617#endif	/* APIC_IO */
618
619	/* initialize all SMP locks */
620	init_locks();
621
622	/* start each Application Processor */
623	start_all_aps(boot_addr);
624
625	/*
626	 * The init process might be started on a different CPU now,
627	 * and the boot CPU might not call prepare_usermode to get
628	 * cr0 correctly configured. Thus we initialize cr0 here.
629	 */
630	load_cr0(rcr0() | CR0_WP | CR0_AM);
631}
632
633
634/*
635 * look for the MP spec signature
636 */
637
638/* string defined by the Intel MP Spec as identifying the MP table */
639#define MP_SIG		0x5f504d5f	/* _MP_ */
640#define NEXT(X)		((X) += 4)
641static int
642search_for_sig(u_int32_t target, int count)
643{
644	int     x;
645	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
646
647	for (x = 0; x < count; NEXT(x))
648		if (addr[x] == MP_SIG)
649			/* make array index a byte index */
650			return (target + (x * sizeof(u_int32_t)));
651
652	return -1;
653}
654
655
656static basetable_entry basetable_entry_types[] =
657{
658	{0, 20, "Processor"},
659	{1, 8, "Bus"},
660	{2, 8, "I/O APIC"},
661	{3, 8, "I/O INT"},
662	{4, 8, "Local INT"}
663};
664
665typedef struct BUSDATA {
666	u_char  bus_id;
667	enum busTypes bus_type;
668}       bus_datum;
669
670typedef struct INTDATA {
671	u_char  int_type;
672	u_short int_flags;
673	u_char  src_bus_id;
674	u_char  src_bus_irq;
675	u_char  dst_apic_id;
676	u_char  dst_apic_int;
677	u_char	int_vector;
678}       io_int, local_int;
679
680typedef struct BUSTYPENAME {
681	u_char  type;
682	char    name[7];
683}       bus_type_name;
684
685static bus_type_name bus_type_table[] =
686{
687	{CBUS, "CBUS"},
688	{CBUSII, "CBUSII"},
689	{EISA, "EISA"},
690	{UNKNOWN_BUSTYPE, "---"},
691	{UNKNOWN_BUSTYPE, "---"},
692	{ISA, "ISA"},
693	{UNKNOWN_BUSTYPE, "---"},
694	{UNKNOWN_BUSTYPE, "---"},
695	{UNKNOWN_BUSTYPE, "---"},
696	{UNKNOWN_BUSTYPE, "---"},
697	{UNKNOWN_BUSTYPE, "---"},
698	{UNKNOWN_BUSTYPE, "---"},
699	{PCI, "PCI"},
700	{UNKNOWN_BUSTYPE, "---"},
701	{UNKNOWN_BUSTYPE, "---"},
702	{UNKNOWN_BUSTYPE, "---"},
703	{UNKNOWN_BUSTYPE, "---"},
704	{XPRESS, "XPRESS"},
705	{UNKNOWN_BUSTYPE, "---"}
706};
707/* from MP spec v1.4, table 5-1 */
708static int default_data[7][5] =
709{
710/*   nbus, id0, type0, id1, type1 */
711	{1, 0, ISA, 255, 255},
712	{1, 0, EISA, 255, 255},
713	{1, 0, EISA, 255, 255},
714	{0, 255, 255, 255, 255},/* MCA not supported */
715	{2, 0, ISA, 1, PCI},
716	{2, 0, EISA, 1, PCI},
717	{0, 255, 255, 255, 255}	/* MCA not supported */
718};
719
720
721/* the bus data */
722static bus_datum bus_data[NBUS];
723
724/* the IO INT data, one entry per possible APIC INTerrupt */
725static io_int  io_apic_ints[NINTR];
726
727static int nintrs;
728
729static int processor_entry	__P((proc_entry_ptr entry, int cpu));
730static int bus_entry		__P((bus_entry_ptr entry, int bus));
731static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
732static int int_entry		__P((int_entry_ptr entry, int intr));
733static int lookup_bus_type	__P((char *name));
734
735
736/*
737 * 1st pass on motherboard's Intel MP specification table.
738 *
739 * initializes:
740 *	mp_ncpus = 1
741 *
742 * determines:
743 *	cpu_apic_address (common to all CPUs)
744 *	io_apic_address[N]
745 *	mp_naps
746 *	mp_nbusses
747 *	mp_napics
748 *	nintrs
749 */
750static int
751mptable_pass1(void)
752{
753	int	x;
754	mpcth_t	cth;
755	int	totalSize;
756	void*	position;
757	int	count;
758	int	type;
759	int	mustpanic;
760
761	POSTCODE(MPTABLE_PASS1_POST);
762
763	mustpanic = 0;
764
765	/* clear various tables */
766	for (x = 0; x < NAPICID; ++x) {
767		io_apic_address[x] = ~0;	/* IO APIC address table */
768	}
769
770	/* init everything to empty */
771	mp_naps = 0;
772	mp_nbusses = 0;
773	mp_napics = 0;
774	nintrs = 0;
775
776	/* check for use of 'default' configuration */
777	if (MPFPS_MPFB1 != 0) {
778		/* use default addresses */
779		cpu_apic_address = DEFAULT_APIC_BASE;
780		io_apic_address[0] = DEFAULT_IO_APIC_BASE;
781
782		/* fill in with defaults */
783		mp_naps = 2;		/* includes BSP */
784		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
785#if defined(APIC_IO)
786		mp_napics = 1;
787		nintrs = 16;
788#endif	/* APIC_IO */
789	}
790	else {
791		if ((cth = mpfps->pap) == 0)
792			panic("MP Configuration Table Header MISSING!");
793
794		cpu_apic_address = (vm_offset_t) cth->apic_address;
795
796		/* walk the table, recording info of interest */
797		totalSize = cth->base_table_length - sizeof(struct MPCTH);
798		position = (u_char *) cth + sizeof(struct MPCTH);
799		count = cth->entry_count;
800
801		while (count--) {
802			switch (type = *(u_char *) position) {
803			case 0: /* processor_entry */
804				if (((proc_entry_ptr)position)->cpu_flags
805					& PROCENTRY_FLAG_EN)
806					++mp_naps;
807				break;
808			case 1: /* bus_entry */
809				++mp_nbusses;
810				break;
811			case 2: /* io_apic_entry */
812				if (((io_apic_entry_ptr)position)->apic_flags
813					& IOAPICENTRY_FLAG_EN)
814					io_apic_address[mp_napics++] =
815					    (vm_offset_t)((io_apic_entry_ptr)
816						position)->apic_address;
817				break;
818			case 3: /* int_entry */
819				++nintrs;
820				break;
821			case 4:	/* int_entry */
822				break;
823			default:
824				panic("mpfps Base Table HOSED!");
825				/* NOTREACHED */
826			}
827
828			totalSize -= basetable_entry_types[type].length;
829			(u_char*)position += basetable_entry_types[type].length;
830		}
831	}
832
833	/* qualify the numbers */
834	if (mp_naps > NCPU)
835#if 0 /* XXX FIXME: kern/4255 */
836		printf("Warning: only using %d of %d available CPUs!\n",
837			NCPU, mp_naps);
838#else
839	{
840		printf("NCPU cannot be different than actual CPU count.\n");
841		printf(" add 'options NCPU=%d' to your kernel config file,\n",
842			mp_naps);
843		printf(" then rerun config & rebuild your SMP kernel\n");
844		mustpanic = 1;
845	}
846#endif /* XXX FIXME: kern/4255 */
847	if (mp_nbusses > NBUS) {
848		printf("found %d busses, increase NBUS\n", mp_nbusses);
849		mustpanic = 1;
850	}
851	if (mp_napics > NAPIC) {
852		printf("found %d apics, increase NAPIC\n", mp_napics);
853		mustpanic = 1;
854	}
855	if (nintrs > NINTR) {
856		printf("found %d intrs, increase NINTR\n", nintrs);
857		mustpanic = 1;
858	}
859
860	/*
861	 * Count the BSP.
862	 * This is also used as a counter while starting the APs.
863	 */
864	mp_ncpus = 1;
865
866	--mp_naps;	/* subtract the BSP */
867
868	return mustpanic;
869}
870
871
872/*
873 * 2nd pass on motherboard's Intel MP specification table.
874 *
875 * sets:
876 *	boot_cpu_id
877 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
878 *	CPU_TO_ID(N), logical CPU to APIC ID table
879 *	IO_TO_ID(N), logical IO to APIC ID table
880 *	bus_data[N]
881 *	io_apic_ints[N]
882 */
883static int
884mptable_pass2(void)
885{
886	int     x;
887	mpcth_t cth;
888	int     totalSize;
889	void*   position;
890	int     count;
891	int     type;
892	int     apic, bus, cpu, intr;
893
894	POSTCODE(MPTABLE_PASS2_POST);
895
896	/* clear various tables */
897	for (x = 0; x < NAPICID; ++x) {
898		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
899		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
900		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
901	}
902
903	/* clear bus data table */
904	for (x = 0; x < NBUS; ++x)
905		bus_data[x].bus_id = 0xff;
906
907	/* clear IO APIC INT table */
908	for (x = 0; x < NINTR; ++x) {
909		io_apic_ints[x].int_type = 0xff;
910		io_apic_ints[x].int_vector = 0xff;
911	}
912
913	/* setup the cpu/apic mapping arrays */
914	boot_cpu_id = -1;
915
916	/* record whether PIC or virtual-wire mode */
917	picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
918
919	/* check for use of 'default' configuration */
920	if (MPFPS_MPFB1 != 0)
921		return MPFPS_MPFB1;	/* return default configuration type */
922
923	if ((cth = mpfps->pap) == 0)
924		panic("MP Configuration Table Header MISSING!");
925
926	/* walk the table, recording info of interest */
927	totalSize = cth->base_table_length - sizeof(struct MPCTH);
928	position = (u_char *) cth + sizeof(struct MPCTH);
929	count = cth->entry_count;
930	apic = bus = intr = 0;
931	cpu = 1;				/* pre-count the BSP */
932
933	while (count--) {
934		switch (type = *(u_char *) position) {
935		case 0:
936			if (processor_entry(position, cpu))
937				++cpu;
938			break;
939		case 1:
940			if (bus_entry(position, bus))
941				++bus;
942			break;
943		case 2:
944			if (io_apic_entry(position, apic))
945				++apic;
946			break;
947		case 3:
948			if (int_entry(position, intr))
949				++intr;
950			break;
951		case 4:
952			/* int_entry(position); */
953			break;
954		default:
955			panic("mpfps Base Table HOSED!");
956			/* NOTREACHED */
957		}
958
959		totalSize -= basetable_entry_types[type].length;
960		(u_char *) position += basetable_entry_types[type].length;
961	}
962
963	if (boot_cpu_id == -1)
964		panic("NO BSP found!");
965
966	/* report fact that its NOT a default configuration */
967	return 0;
968}
969
970
971static void
972assign_apic_irq(int apic, int intpin, int irq)
973{
974	int x;
975
976	if (int_to_apicintpin[irq].ioapic != -1)
977		panic("assign_apic_irq: inconsistent table");
978
979	int_to_apicintpin[irq].ioapic = apic;
980	int_to_apicintpin[irq].int_pin = intpin;
981	int_to_apicintpin[irq].apic_address = ioapic[apic];
982	int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin;
983
984	for (x = 0; x < nintrs; x++) {
985		if ((io_apic_ints[x].int_type == 0 ||
986		     io_apic_ints[x].int_type == 3) &&
987		    io_apic_ints[x].int_vector == 0xff &&
988		    io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) &&
989		    io_apic_ints[x].dst_apic_int == intpin)
990			io_apic_ints[x].int_vector = irq;
991	}
992}
993
994/*
995 * parse an Intel MP specification table
996 */
997static void
998fix_mp_table(void)
999{
1000	int	x;
1001	int	id;
1002	int	bus_0 = 0;	/* Stop GCC warning */
1003	int	bus_pci = 0;	/* Stop GCC warning */
1004	int	num_pci_bus;
1005
1006	/*
1007	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
1008	 * did it wrong.  The MP spec says that when more than 1 PCI bus
1009	 * exists the BIOS must begin with bus entries for the PCI bus and use
1010	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
1011	 * exists the BIOS can choose to ignore this ordering, and indeed many
1012	 * MP motherboards do ignore it.  This causes a problem when the PCI
1013	 * sub-system makes requests of the MP sub-system based on PCI bus
1014	 * numbers.	So here we look for the situation and renumber the
1015	 * busses and associated INTs in an effort to "make it right".
1016	 */
1017
1018	/* find bus 0, PCI bus, count the number of PCI busses */
1019	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
1020		if (bus_data[x].bus_id == 0) {
1021			bus_0 = x;
1022		}
1023		if (bus_data[x].bus_type == PCI) {
1024			++num_pci_bus;
1025			bus_pci = x;
1026		}
1027	}
1028	/*
1029	 * bus_0 == slot of bus with ID of 0
1030	 * bus_pci == slot of last PCI bus encountered
1031	 */
1032
1033	/* check the 1 PCI bus case for sanity */
1034	if (num_pci_bus == 1) {
1035
1036		/* if it is number 0 all is well */
1037		if (bus_data[bus_pci].bus_id == 0)
1038			return;
1039
1040		/* mis-numbered, swap with whichever bus uses slot 0 */
1041
1042		/* swap the bus entry types */
1043		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
1044		bus_data[bus_0].bus_type = PCI;
1045
1046		/* swap each relavant INTerrupt entry */
1047		id = bus_data[bus_pci].bus_id;
1048		for (x = 0; x < nintrs; ++x) {
1049			if (io_apic_ints[x].src_bus_id == id) {
1050				io_apic_ints[x].src_bus_id = 0;
1051			}
1052			else if (io_apic_ints[x].src_bus_id == 0) {
1053				io_apic_ints[x].src_bus_id = id;
1054			}
1055		}
1056	}
1057	/* sanity check if more than 1 PCI bus */
1058	else if (num_pci_bus > 1) {
1059		for (x = 0; x < mp_nbusses; ++x) {
1060			if (bus_data[x].bus_type != PCI)
1061				continue;
1062			if (bus_data[x].bus_id >= num_pci_bus)
1063				panic("bad PCI bus numbering");
1064		}
1065	}
1066}
1067
1068
1069static void
1070setup_apic_irq_mapping(void)
1071{
1072	int	x;
1073	int	int_vector;
1074
1075	/* Assign low level interrupt handlers */
1076	for (x = 0; x < APIC_INTMAPSIZE; x++) {
1077		int_to_apicintpin[x].ioapic = -1;
1078		int_to_apicintpin[x].int_pin = 0;
1079		int_to_apicintpin[x].apic_address = NULL;
1080		int_to_apicintpin[x].redirindex = 0;
1081	}
1082	for (x = 0; x < nintrs; x++) {
1083		if (io_apic_ints[x].dst_apic_int < APIC_INTMAPSIZE &&
1084		    io_apic_ints[x].dst_apic_id == IO_TO_ID(0) &&
1085		    io_apic_ints[x].int_vector == 0xff &&
1086		    (io_apic_ints[x].int_type == 0 ||
1087		     io_apic_ints[x].int_type == 3)) {
1088			assign_apic_irq(0,
1089					io_apic_ints[x].dst_apic_int,
1090					io_apic_ints[x].dst_apic_int);
1091		}
1092	}
1093	int_vector = 0;
1094	while (int_vector < APIC_INTMAPSIZE &&
1095	       int_to_apicintpin[int_vector].ioapic != -1)
1096		int_vector++;
1097	for (x = 0; x < nintrs && int_vector < APIC_INTMAPSIZE; x++) {
1098		if ((io_apic_ints[x].int_type == 0 ||
1099		     io_apic_ints[x].int_type == 3) &&
1100		    io_apic_ints[x].int_vector == 0xff) {
1101			assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id),
1102					io_apic_ints[x].dst_apic_int,
1103					int_vector);
1104			int_vector++;
1105			while (int_vector < APIC_INTMAPSIZE &&
1106			       int_to_apicintpin[int_vector].ioapic != -1)
1107				int_vector++;
1108		}
1109	}
1110}
1111
1112
1113static int
1114processor_entry(proc_entry_ptr entry, int cpu)
1115{
1116	/* check for usability */
1117	if ((cpu >= NCPU) || !(entry->cpu_flags & PROCENTRY_FLAG_EN))
1118		return 0;
1119
1120	/* check for BSP flag */
1121	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
1122		boot_cpu_id = entry->apic_id;
1123		CPU_TO_ID(0) = entry->apic_id;
1124		ID_TO_CPU(entry->apic_id) = 0;
1125		return 0;	/* its already been counted */
1126	}
1127
1128	/* add another AP to list, if less than max number of CPUs */
1129	else {
1130		CPU_TO_ID(cpu) = entry->apic_id;
1131		ID_TO_CPU(entry->apic_id) = cpu;
1132		return 1;
1133	}
1134}
1135
1136
1137static int
1138bus_entry(bus_entry_ptr entry, int bus)
1139{
1140	int     x;
1141	char    c, name[8];
1142
1143	/* encode the name into an index */
1144	for (x = 0; x < 6; ++x) {
1145		if ((c = entry->bus_type[x]) == ' ')
1146			break;
1147		name[x] = c;
1148	}
1149	name[x] = '\0';
1150
1151	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
1152		panic("unknown bus type: '%s'", name);
1153
1154	bus_data[bus].bus_id = entry->bus_id;
1155	bus_data[bus].bus_type = x;
1156
1157	return 1;
1158}
1159
1160
1161static int
1162io_apic_entry(io_apic_entry_ptr entry, int apic)
1163{
1164	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
1165		return 0;
1166
1167	IO_TO_ID(apic) = entry->apic_id;
1168	ID_TO_IO(entry->apic_id) = apic;
1169
1170	return 1;
1171}
1172
1173
1174static int
1175lookup_bus_type(char *name)
1176{
1177	int     x;
1178
1179	for (x = 0; x < MAX_BUSTYPE; ++x)
1180		if (strcmp(bus_type_table[x].name, name) == 0)
1181			return bus_type_table[x].type;
1182
1183	return UNKNOWN_BUSTYPE;
1184}
1185
1186
1187static int
1188int_entry(int_entry_ptr entry, int intr)
1189{
1190	int apic;
1191
1192	io_apic_ints[intr].int_type = entry->int_type;
1193	io_apic_ints[intr].int_flags = entry->int_flags;
1194	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
1195	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
1196	if (entry->dst_apic_id == 255) {
1197		/* This signal goes to all IO APICS.  Select an IO APIC
1198		   with sufficient number of interrupt pins */
1199		for (apic = 0; apic < mp_napics; apic++)
1200			if (((io_apic_read(apic, IOAPIC_VER) &
1201			      IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >=
1202			    entry->dst_apic_int)
1203				break;
1204		if (apic < mp_napics)
1205			io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic);
1206		else
1207			io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1208	} else
1209		io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1210	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
1211
1212	return 1;
1213}
1214
1215
1216static int
1217apic_int_is_bus_type(int intr, int bus_type)
1218{
1219	int     bus;
1220
1221	for (bus = 0; bus < mp_nbusses; ++bus)
1222		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
1223		    && ((int) bus_data[bus].bus_type == bus_type))
1224			return 1;
1225
1226	return 0;
1227}
1228
1229
1230/*
1231 * Given a traditional ISA INT mask, return an APIC mask.
1232 */
1233u_int
1234isa_apic_mask(u_int isa_mask)
1235{
1236	int isa_irq;
1237	int apic_pin;
1238
1239#if defined(SKIP_IRQ15_REDIRECT)
1240	if (isa_mask == (1 << 15)) {
1241		printf("skipping ISA IRQ15 redirect\n");
1242		return isa_mask;
1243	}
1244#endif  /* SKIP_IRQ15_REDIRECT */
1245
1246	isa_irq = ffs(isa_mask);		/* find its bit position */
1247	if (isa_irq == 0)			/* doesn't exist */
1248		return 0;
1249	--isa_irq;				/* make it zero based */
1250
1251	apic_pin = isa_apic_irq(isa_irq);	/* look for APIC connection */
1252	if (apic_pin == -1)
1253		return 0;
1254
1255	return (1 << apic_pin);			/* convert pin# to a mask */
1256}
1257
1258
1259/*
1260 * Determine which APIC pin an ISA/EISA INT is attached to.
1261 */
1262#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
1263#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
1264#define INTIRQ(I)	(io_apic_ints[(I)].int_vector)
1265#define INTAPIC(I)	(ID_TO_IO(io_apic_ints[(I)].dst_apic_id))
1266
1267#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)
1268int
1269isa_apic_irq(int isa_irq)
1270{
1271	int     intr;
1272
1273	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
1274		if (INTTYPE(intr) == 0) {		/* standard INT */
1275			if (SRCBUSIRQ(intr) == isa_irq) {
1276				if (apic_int_is_bus_type(intr, ISA) ||
1277			            apic_int_is_bus_type(intr, EISA))
1278					return INTIRQ(intr);	/* found */
1279			}
1280		}
1281	}
1282	return -1;					/* NOT found */
1283}
1284
1285
1286/*
1287 * Determine which APIC pin a PCI INT is attached to.
1288 */
1289#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
1290#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
1291#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
1292int
1293pci_apic_irq(int pciBus, int pciDevice, int pciInt)
1294{
1295	int     intr;
1296
1297	--pciInt;					/* zero based */
1298
1299	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
1300		if ((INTTYPE(intr) == 0)		/* standard INT */
1301		    && (SRCBUSID(intr) == pciBus)
1302		    && (SRCBUSDEVICE(intr) == pciDevice)
1303		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
1304			if (apic_int_is_bus_type(intr, PCI))
1305				return INTIRQ(intr);	/* exact match */
1306
1307	return -1;					/* NOT found */
1308}
1309
1310int
1311next_apic_irq(int irq)
1312{
1313	int intr, ointr;
1314	int bus, bustype;
1315
1316	bus = 0;
1317	bustype = 0;
1318	for (intr = 0; intr < nintrs; intr++) {
1319		if (INTIRQ(intr) != irq || INTTYPE(intr) != 0)
1320			continue;
1321		bus = SRCBUSID(intr);
1322		bustype = apic_bus_type(bus);
1323		if (bustype != ISA &&
1324		    bustype != EISA &&
1325		    bustype != PCI)
1326			continue;
1327		break;
1328	}
1329	if (intr >= nintrs) {
1330		return -1;
1331	}
1332	for (ointr = intr + 1; ointr < nintrs; ointr++) {
1333		if (INTTYPE(ointr) != 0)
1334			continue;
1335		if (bus != SRCBUSID(ointr))
1336			continue;
1337		if (bustype == PCI) {
1338			if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr))
1339				continue;
1340			if (SRCBUSLINE(intr) != SRCBUSLINE(ointr))
1341				continue;
1342		}
1343		if (bustype == ISA || bustype == EISA) {
1344			if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr))
1345				continue;
1346		}
1347		if (INTPIN(intr) == INTPIN(ointr))
1348			continue;
1349		break;
1350	}
1351	if (ointr >= nintrs) {
1352		return -1;
1353	}
1354	return INTIRQ(ointr);
1355}
1356#undef SRCBUSLINE
1357#undef SRCBUSDEVICE
1358#undef SRCBUSID
1359#undef SRCBUSIRQ
1360
1361#undef INTPIN
1362#undef INTIRQ
1363#undef INTAPIC
1364#undef INTTYPE
1365
1366
1367/*
1368 * Reprogram the MB chipset to NOT redirect an ISA INTerrupt.
1369 *
1370 * XXX FIXME:
1371 *  Exactly what this means is unclear at this point.  It is a solution
1372 *  for motherboards that redirect the MBIRQ0 pin.  Generically a motherboard
1373 *  could route any of the ISA INTs to upper (>15) IRQ values.  But most would
1374 *  NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an
1375 *  option.
1376 */
1377int
1378undirect_isa_irq(int rirq)
1379{
1380#if defined(READY)
1381	if (bootverbose)
1382	    printf("Freeing redirected ISA irq %d.\n", rirq);
1383	/** FIXME: tickle the MB redirector chip */
1384	return ???;
1385#else
1386	if (bootverbose)
1387	    printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
1388	return 0;
1389#endif  /* READY */
1390}
1391
1392
1393/*
1394 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
1395 */
1396int
1397undirect_pci_irq(int rirq)
1398{
1399#if defined(READY)
1400	if (bootverbose)
1401		printf("Freeing redirected PCI irq %d.\n", rirq);
1402
1403	/** FIXME: tickle the MB redirector chip */
1404	return ???;
1405#else
1406	if (bootverbose)
1407		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
1408		       rirq);
1409	return 0;
1410#endif  /* READY */
1411}
1412
1413
1414/*
1415 * given a bus ID, return:
1416 *  the bus type if found
1417 *  -1 if NOT found
1418 */
1419int
1420apic_bus_type(int id)
1421{
1422	int     x;
1423
1424	for (x = 0; x < mp_nbusses; ++x)
1425		if (bus_data[x].bus_id == id)
1426			return bus_data[x].bus_type;
1427
1428	return -1;
1429}
1430
1431
1432/*
1433 * given a LOGICAL APIC# and pin#, return:
1434 *  the associated src bus ID if found
1435 *  -1 if NOT found
1436 */
1437int
1438apic_src_bus_id(int apic, int pin)
1439{
1440	int     x;
1441
1442	/* search each of the possible INTerrupt sources */
1443	for (x = 0; x < nintrs; ++x)
1444		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1445		    (pin == io_apic_ints[x].dst_apic_int))
1446			return (io_apic_ints[x].src_bus_id);
1447
1448	return -1;		/* NOT found */
1449}
1450
1451
1452/*
1453 * given a LOGICAL APIC# and pin#, return:
1454 *  the associated src bus IRQ if found
1455 *  -1 if NOT found
1456 */
1457int
1458apic_src_bus_irq(int apic, int pin)
1459{
1460	int     x;
1461
1462	for (x = 0; x < nintrs; x++)
1463		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1464		    (pin == io_apic_ints[x].dst_apic_int))
1465			return (io_apic_ints[x].src_bus_irq);
1466
1467	return -1;		/* NOT found */
1468}
1469
1470
1471/*
1472 * given a LOGICAL APIC# and pin#, return:
1473 *  the associated INTerrupt type if found
1474 *  -1 if NOT found
1475 */
1476int
1477apic_int_type(int apic, int pin)
1478{
1479	int     x;
1480
1481	/* search each of the possible INTerrupt sources */
1482	for (x = 0; x < nintrs; ++x)
1483		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1484		    (pin == io_apic_ints[x].dst_apic_int))
1485			return (io_apic_ints[x].int_type);
1486
1487	return -1;		/* NOT found */
1488}
1489
1490int
1491apic_irq(int apic, int pin)
1492{
1493	int x;
1494	int res;
1495
1496	for (x = 0; x < nintrs; ++x)
1497		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1498		    (pin == io_apic_ints[x].dst_apic_int)) {
1499			res = io_apic_ints[x].int_vector;
1500			if (res == 0xff)
1501				return -1;
1502			if (apic != int_to_apicintpin[res].ioapic)
1503				panic("apic_irq: inconsistent table");
1504			if (pin != int_to_apicintpin[res].int_pin)
1505				panic("apic_irq inconsistent table (2)");
1506			return res;
1507		}
1508	return -1;
1509}
1510
1511
1512/*
1513 * given a LOGICAL APIC# and pin#, return:
1514 *  the associated trigger mode if found
1515 *  -1 if NOT found
1516 */
1517int
1518apic_trigger(int apic, int pin)
1519{
1520	int     x;
1521
1522	/* search each of the possible INTerrupt sources */
1523	for (x = 0; x < nintrs; ++x)
1524		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1525		    (pin == io_apic_ints[x].dst_apic_int))
1526			return ((io_apic_ints[x].int_flags >> 2) & 0x03);
1527
1528	return -1;		/* NOT found */
1529}
1530
1531
1532/*
1533 * given a LOGICAL APIC# and pin#, return:
1534 *  the associated 'active' level if found
1535 *  -1 if NOT found
1536 */
1537int
1538apic_polarity(int apic, int pin)
1539{
1540	int     x;
1541
1542	/* search each of the possible INTerrupt sources */
1543	for (x = 0; x < nintrs; ++x)
1544		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1545		    (pin == io_apic_ints[x].dst_apic_int))
1546			return (io_apic_ints[x].int_flags & 0x03);
1547
1548	return -1;		/* NOT found */
1549}
1550
1551
1552/*
1553 * set data according to MP defaults
1554 * FIXME: probably not complete yet...
1555 */
1556static void
1557default_mp_table(int type)
1558{
1559	int     ap_cpu_id;
1560#if defined(APIC_IO)
1561	u_int32_t ux;
1562	int     io_apic_id;
1563	int     pin;
1564#endif	/* APIC_IO */
1565
1566#if 0
1567	printf("  MP default config type: %d\n", type);
1568	switch (type) {
1569	case 1:
1570		printf("   bus: ISA, APIC: 82489DX\n");
1571		break;
1572	case 2:
1573		printf("   bus: EISA, APIC: 82489DX\n");
1574		break;
1575	case 3:
1576		printf("   bus: EISA, APIC: 82489DX\n");
1577		break;
1578	case 4:
1579		printf("   bus: MCA, APIC: 82489DX\n");
1580		break;
1581	case 5:
1582		printf("   bus: ISA+PCI, APIC: Integrated\n");
1583		break;
1584	case 6:
1585		printf("   bus: EISA+PCI, APIC: Integrated\n");
1586		break;
1587	case 7:
1588		printf("   bus: MCA+PCI, APIC: Integrated\n");
1589		break;
1590	default:
1591		printf("   future type\n");
1592		break;
1593		/* NOTREACHED */
1594	}
1595#endif	/* 0 */
1596
1597	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
1598	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;
1599
1600	/* BSP */
1601	CPU_TO_ID(0) = boot_cpu_id;
1602	ID_TO_CPU(boot_cpu_id) = 0;
1603
1604	/* one and only AP */
1605	CPU_TO_ID(1) = ap_cpu_id;
1606	ID_TO_CPU(ap_cpu_id) = 1;
1607
1608#if defined(APIC_IO)
1609	/* one and only IO APIC */
1610	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;
1611
1612	/*
1613	 * sanity check, refer to MP spec section 3.6.6, last paragraph
1614	 * necessary as some hardware isn't properly setting up the IO APIC
1615	 */
1616#if defined(REALLY_ANAL_IOAPICID_VALUE)
1617	if (io_apic_id != 2) {
1618#else
1619	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
1620#endif	/* REALLY_ANAL_IOAPICID_VALUE */
1621		ux = io_apic_read(0, IOAPIC_ID);	/* get current contents */
1622		ux &= ~APIC_ID_MASK;	/* clear the ID field */
1623		ux |= 0x02000000;	/* set it to '2' */
1624		io_apic_write(0, IOAPIC_ID, ux);	/* write new value */
1625		ux = io_apic_read(0, IOAPIC_ID);	/* re-read && test */
1626		if ((ux & APIC_ID_MASK) != 0x02000000)
1627			panic("can't control IO APIC ID, reg: 0x%08x", ux);
1628		io_apic_id = 2;
1629	}
1630	IO_TO_ID(0) = io_apic_id;
1631	ID_TO_IO(io_apic_id) = 0;
1632#endif	/* APIC_IO */
1633
1634	/* fill out bus entries */
1635	switch (type) {
1636	case 1:
1637	case 2:
1638	case 3:
1639	case 5:
1640	case 6:
1641		bus_data[0].bus_id = default_data[type - 1][1];
1642		bus_data[0].bus_type = default_data[type - 1][2];
1643		bus_data[1].bus_id = default_data[type - 1][3];
1644		bus_data[1].bus_type = default_data[type - 1][4];
1645		break;
1646
1647	/* case 4: case 7:		   MCA NOT supported */
1648	default:		/* illegal/reserved */
1649		panic("BAD default MP config: %d", type);
1650		/* NOTREACHED */
1651	}
1652
1653#if defined(APIC_IO)
1654	/* general cases from MP v1.4, table 5-2 */
1655	for (pin = 0; pin < 16; ++pin) {
1656		io_apic_ints[pin].int_type = 0;
1657		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
1658		io_apic_ints[pin].src_bus_id = 0;
1659		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
1660		io_apic_ints[pin].dst_apic_id = io_apic_id;
1661		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
1662	}
1663
1664	/* special cases from MP v1.4, table 5-2 */
1665	if (type == 2) {
1666		io_apic_ints[2].int_type = 0xff;	/* N/C */
1667		io_apic_ints[13].int_type = 0xff;	/* N/C */
1668#if !defined(APIC_MIXED_MODE)
1669		/** FIXME: ??? */
1670		panic("sorry, can't support type 2 default yet");
1671#endif	/* APIC_MIXED_MODE */
1672	}
1673	else
1674		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */
1675
1676	if (type == 7)
1677		io_apic_ints[0].int_type = 0xff;	/* N/C */
1678	else
1679		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
1680#endif	/* APIC_IO */
1681}
1682
1683
1684/*
1685 * initialize all the SMP locks
1686 */
1687
1688/* critical region around IO APIC, apic_imen */
1689struct simplelock	imen_lock;
1690
1691/* critical region around splxx(), cpl, cml, cil, ipending */
1692struct simplelock	cpl_lock;
1693
1694/* Make FAST_INTR() routines sequential */
1695struct simplelock	fast_intr_lock;
1696
1697/* critical region around INTR() routines */
1698struct simplelock	intr_lock;
1699
1700/* lock regions protected in UP kernel via cli/sti */
1701struct simplelock	mpintr_lock;
1702
1703/* lock region used by kernel profiling */
1704struct simplelock	mcount_lock;
1705
1706#ifdef USE_COMLOCK
1707/* locks com (tty) data/hardware accesses: a FASTINTR() */
1708struct simplelock	com_lock;
1709#endif /* USE_COMLOCK */
1710
1711#ifdef USE_CLOCKLOCK
1712/* lock regions around the clock hardware */
1713struct simplelock	clock_lock;
1714#endif /* USE_CLOCKLOCK */
1715
1716static void
1717init_locks(void)
1718{
1719	/*
1720	 * Get the initial mp_lock with a count of 1 for the BSP.
1721	 * This uses a LOGICAL cpu ID, ie BSP == 0.
1722	 */
1723	mp_lock = 0x00000001;
1724
1725	/* ISR uses its own "giant lock" */
1726	isr_lock = FREE_LOCK;
1727
1728#if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ)
1729	s_lock_init((struct simplelock*)&apic_itrace_debuglock);
1730#endif
1731
1732	s_lock_init((struct simplelock*)&mpintr_lock);
1733
1734	s_lock_init((struct simplelock*)&mcount_lock);
1735
1736	s_lock_init((struct simplelock*)&fast_intr_lock);
1737	s_lock_init((struct simplelock*)&intr_lock);
1738	s_lock_init((struct simplelock*)&imen_lock);
1739	s_lock_init((struct simplelock*)&cpl_lock);
1740
1741#ifdef USE_COMLOCK
1742	s_lock_init((struct simplelock*)&com_lock);
1743#endif /* USE_COMLOCK */
1744#ifdef USE_CLOCKLOCK
1745	s_lock_init((struct simplelock*)&clock_lock);
1746#endif /* USE_CLOCKLOCK */
1747}
1748
1749
1750/* Wait for all APs to be fully initialized */
1751extern int wait_ap(unsigned int);
1752
1753/*
1754 * start each AP in our list
1755 */
1756static int
1757start_all_aps(u_int boot_addr)
1758{
1759	int     x, i, pg;
1760	u_char  mpbiosreason;
1761	u_long  mpbioswarmvec;
1762	struct globaldata *gd;
1763	char *stack;
1764
1765	POSTCODE(START_ALL_APS_POST);
1766
1767	/* initialize BSP's local APIC */
1768	apic_initialize();
1769	bsp_apic_ready = 1;
1770
1771	/* install the AP 1st level boot code */
1772	install_ap_tramp(boot_addr);
1773
1774
1775	/* save the current value of the warm-start vector */
1776	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
1777#ifndef PC98
1778	outb(CMOS_REG, BIOS_RESET);
1779	mpbiosreason = inb(CMOS_DATA);
1780#endif
1781
1782	/* record BSP in CPU map */
1783	all_cpus = 1;
1784
1785	/* set up 0 -> 4MB P==V mapping for AP boot */
1786	*(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME);
1787	invltlb();
1788
1789	/* start each AP */
1790	for (x = 1; x <= mp_naps; ++x) {
1791
1792		/* This is a bit verbose, it will go away soon.  */
1793
1794		/* first page of AP's private space */
1795		pg = x * i386_btop(sizeof(struct privatespace));
1796
1797		/* allocate a new private data page */
1798		gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE);
1799
1800		/* wire it into the private page table page */
1801		SMPpt[pg] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd));
1802
1803		/* allocate and set up an idle stack data page */
1804		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
1805		for (i = 0; i < UPAGES; i++)
1806			SMPpt[pg + 5 + i] = (pt_entry_t)
1807			    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
1808
1809		SMPpt[pg + 1] = 0;		/* *prv_CMAP1 */
1810		SMPpt[pg + 2] = 0;		/* *prv_CMAP2 */
1811		SMPpt[pg + 3] = 0;		/* *prv_CMAP3 */
1812		SMPpt[pg + 4] = 0;		/* *prv_PMAP1 */
1813
1814		/* prime data page for it to use */
1815		gd->gd_cpuid = x;
1816		gd->gd_cpu_lockid = x << 24;
1817		gd->gd_prv_CMAP1 = &SMPpt[pg + 1];
1818		gd->gd_prv_CMAP2 = &SMPpt[pg + 2];
1819		gd->gd_prv_CMAP3 = &SMPpt[pg + 3];
1820		gd->gd_prv_PMAP1 = &SMPpt[pg + 4];
1821		gd->gd_prv_CADDR1 = SMP_prvspace[x].CPAGE1;
1822		gd->gd_prv_CADDR2 = SMP_prvspace[x].CPAGE2;
1823		gd->gd_prv_CADDR3 = SMP_prvspace[x].CPAGE3;
1824		gd->gd_prv_PADDR1 = (unsigned *)SMP_prvspace[x].PPAGE1;
1825
1826		/* setup a vector to our boot code */
1827		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
1828		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
1829#ifndef PC98
1830		outb(CMOS_REG, BIOS_RESET);
1831		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
1832#endif
1833
1834		bootSTK = &SMP_prvspace[x].idlestack[UPAGES*PAGE_SIZE];
1835		boot_cpuid = x;
1836
1837		/* attempt to start the Application Processor */
1838		CHECK_INIT(99);	/* setup checkpoints */
1839		if (!start_ap(x, boot_addr)) {
1840			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
1841			CHECK_PRINT("trace");	/* show checkpoints */
1842			/* better panic as the AP may be running loose */
1843			printf("panic y/n? [y] ");
1844			if (cngetc() != 'n')
1845				panic("bye-bye");
1846		}
1847		CHECK_PRINT("trace");		/* show checkpoints */
1848
1849		/* record its version info */
1850		cpu_apic_versions[x] = cpu_apic_versions[0];
1851
1852		all_cpus |= (1 << x);		/* record AP in CPU map */
1853	}
1854
1855	/* build our map of 'other' CPUs */
1856	other_cpus = all_cpus & ~(1 << cpuid);
1857
1858	/* fill in our (BSP) APIC version */
1859	cpu_apic_versions[0] = lapic.version;
1860
1861	/* restore the warmstart vector */
1862	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
1863#ifndef PC98
1864	outb(CMOS_REG, BIOS_RESET);
1865	outb(CMOS_DATA, mpbiosreason);
1866#endif
1867
1868	/*
1869	 * Set up the idle context for the BSP.  Similar to above except
1870	 * that some was done by locore, some by pmap.c and some is implicit
1871	 * because the BSP is cpu#0 and the page is initially zero, and also
1872	 * because we can refer to variables by name on the BSP..
1873	 */
1874
1875	/* Allocate and setup BSP idle stack */
1876	stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
1877	for (i = 0; i < UPAGES; i++)
1878		SMPpt[5 + i] = (pt_entry_t)
1879		    (PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
1880
1881	*(int *)PTD = 0;
1882	pmap_set_opt_bsp();
1883
1884	/* number of APs actually started */
1885	return mp_ncpus - 1;
1886}
1887
1888
1889/*
1890 * load the 1st level AP boot code into base memory.
1891 */
1892
1893/* targets for relocation */
1894extern void bigJump(void);
1895extern void bootCodeSeg(void);
1896extern void bootDataSeg(void);
1897extern void MPentry(void);
1898extern u_int MP_GDT;
1899extern u_int mp_gdtbase;
1900
1901static void
1902install_ap_tramp(u_int boot_addr)
1903{
1904	int     x;
1905	int     size = *(int *) ((u_long) & bootMP_size);
1906	u_char *src = (u_char *) ((u_long) bootMP);
1907	u_char *dst = (u_char *) boot_addr + KERNBASE;
1908	u_int   boot_base = (u_int) bootMP;
1909	u_int8_t *dst8;
1910	u_int16_t *dst16;
1911	u_int32_t *dst32;
1912
1913	POSTCODE(INSTALL_AP_TRAMP_POST);
1914
1915	for (x = 0; x < size; ++x)
1916		*dst++ = *src++;
1917
1918	/*
1919	 * modify addresses in code we just moved to basemem. unfortunately we
1920	 * need fairly detailed info about mpboot.s for this to work.  changes
1921	 * to mpboot.s might require changes here.
1922	 */
1923
1924	/* boot code is located in KERNEL space */
1925	dst = (u_char *) boot_addr + KERNBASE;
1926
1927	/* modify the lgdt arg */
1928	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
1929	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);
1930
1931	/* modify the ljmp target for MPentry() */
1932	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
1933	*dst32 = ((u_int) MPentry - KERNBASE);
1934
1935	/* modify the target for boot code segment */
1936	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
1937	dst8 = (u_int8_t *) (dst16 + 1);
1938	*dst16 = (u_int) boot_addr & 0xffff;
1939	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
1940
1941	/* modify the target for boot data segment */
1942	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
1943	dst8 = (u_int8_t *) (dst16 + 1);
1944	*dst16 = (u_int) boot_addr & 0xffff;
1945	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
1946}
1947
1948
1949/*
1950 * this function starts the AP (application processor) identified
1951 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
1952 * to accomplish this.  This is necessary because of the nuances
1953 * of the different hardware we might encounter.  It ain't pretty,
1954 * but it seems to work.
1955 */
1956static int
1957start_ap(int logical_cpu, u_int boot_addr)
1958{
1959	int     physical_cpu;
1960	int     vector;
1961	int     cpus;
1962	u_long  icr_lo, icr_hi;
1963
1964	POSTCODE(START_AP_POST);
1965
1966	/* get the PHYSICAL APIC ID# */
1967	physical_cpu = CPU_TO_ID(logical_cpu);
1968
1969	/* calculate the vector */
1970	vector = (boot_addr >> 12) & 0xff;
1971
1972	/* used as a watchpoint to signal AP startup */
1973	cpus = mp_ncpus;
1974
1975	/*
1976	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
1977	 * and running the target CPU. OR this INIT IPI might be latched (P5
1978	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
1979	 * ignored.
1980	 */
1981
1982	/* setup the address for the target AP */
1983	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
1984	icr_hi |= (physical_cpu << 24);
1985	lapic.icr_hi = icr_hi;
1986
1987	/* do an INIT IPI: assert RESET */
1988	icr_lo = lapic.icr_lo & 0xfff00000;
1989	lapic.icr_lo = icr_lo | 0x0000c500;
1990
1991	/* wait for pending status end */
1992	while (lapic.icr_lo & APIC_DELSTAT_MASK)
1993		 /* spin */ ;
1994
1995	/* do an INIT IPI: deassert RESET */
1996	lapic.icr_lo = icr_lo | 0x00008500;
1997
1998	/* wait for pending status end */
1999	u_sleep(10000);		/* wait ~10mS */
2000	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2001		 /* spin */ ;
2002
2003	/*
2004	 * next we do a STARTUP IPI: the previous INIT IPI might still be
2005	 * latched, (P5 bug) this 1st STARTUP would then terminate
2006	 * immediately, and the previously started INIT IPI would continue. OR
2007	 * the previous INIT IPI has already run. and this STARTUP IPI will
2008	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
2009	 * will run.
2010	 */
2011
2012	/* do a STARTUP IPI */
2013	lapic.icr_lo = icr_lo | 0x00000600 | vector;
2014	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2015		 /* spin */ ;
2016	u_sleep(200);		/* wait ~200uS */
2017
2018	/*
2019	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
2020	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
2021	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
2022	 * recognized after hardware RESET or INIT IPI.
2023	 */
2024
2025	lapic.icr_lo = icr_lo | 0x00000600 | vector;
2026	while (lapic.icr_lo & APIC_DELSTAT_MASK)
2027		 /* spin */ ;
2028	u_sleep(200);		/* wait ~200uS */
2029
2030	/* wait for it to start */
2031	set_apic_timer(5000000);/* == 5 seconds */
2032	while (read_apic_timer())
2033		if (mp_ncpus > cpus)
2034			return 1;	/* return SUCCESS */
2035
2036	return 0;		/* return FAILURE */
2037}
2038
2039
/*
 * Flush the TLB on all other CPU's
 *
 * Sends the XINVLTLB IPI to every CPU but the caller, provided SMP has
 * been started and invltlb_ok is set.  Does nothing without APIC_IO.
 *
 * XXX: Needs to handshake and wait for completion before proceding.
 */
void
smp_invltlb(void)
{
#if defined(APIC_IO)
	if (!smp_started || !invltlb_ok)
		return;

	all_but_self_ipi(XINVLTLB_OFFSET);
#endif  /* APIC_IO */
}
2053
2054void
2055invlpg(u_int addr)
2056{
2057	__asm   __volatile("invlpg (%0)"::"r"(addr):"memory");
2058
2059	/* send a message to the other CPUs */
2060	smp_invltlb();
2061}
2062
2063void
2064invltlb(void)
2065{
2066	u_long  temp;
2067
2068	/*
2069	 * This should be implemented as load_cr3(rcr3()) when load_cr3() is
2070	 * inlined.
2071	 */
2072	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
2073
2074	/* send a message to the other CPUs */
2075	smp_invltlb();
2076}
2077
2078
2079/*
2080 * When called the executing CPU will send an IPI to all other CPUs
2081 *  requesting that they halt execution.
2082 *
2083 * Usually (but not necessarily) called with 'other_cpus' as its arg.
2084 *
2085 *  - Signals all CPUs in map to stop.
2086 *  - Waits for each to stop.
2087 *
2088 * Returns:
2089 *  -1: error
2090 *   0: NA
2091 *   1: ok
2092 *
2093 * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs
2094 *            from executing at same time.
2095 */
2096int
2097stop_cpus(u_int map)
2098{
2099	if (!smp_started)
2100		return 0;
2101
2102	/* send the Xcpustop IPI to all CPUs in map */
2103	selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED);
2104
2105	while ((stopped_cpus & map) != map)
2106		/* spin */ ;
2107
2108	return 1;
2109}
2110
2111
2112/*
2113 * Called by a CPU to restart stopped CPUs.
2114 *
2115 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
2116 *
2117 *  - Signals all CPUs in map to restart.
2118 *  - Waits for each to restart.
2119 *
2120 * Returns:
2121 *  -1: error
2122 *   0: NA
2123 *   1: ok
2124 */
2125int
2126restart_cpus(u_int map)
2127{
2128	if (!smp_started)
2129		return 0;
2130
2131	started_cpus = map;		/* signal other cpus to restart */
2132
2133	while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */
2134		/* spin */ ;
2135
2136	return 1;
2137}
2138
2139int smp_active = 0;	/* are the APs allowed to run? */
2140SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, "");
2141
2142/* XXX maybe should be hw.ncpu */
2143static int smp_cpus = 1;	/* how many cpu's running */
2144SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, "");
2145
2146int invltlb_ok = 0;	/* throttle smp_invltlb() till safe */
2147SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");
2148
2149/* Warning: Do not staticize.  Used from swtch.s */
2150int do_page_zero_idle = 1; /* bzero pages for fun and profit in idleloop */
2151SYSCTL_INT(_machdep, OID_AUTO, do_page_zero_idle, CTLFLAG_RW,
2152	   &do_page_zero_idle, 0, "");
2153
2154/* Is forwarding of a interrupt to the CPU holding the ISR lock enabled ? */
2155int forward_irq_enabled = 1;
2156SYSCTL_INT(_machdep, OID_AUTO, forward_irq_enabled, CTLFLAG_RW,
2157	   &forward_irq_enabled, 0, "");
2158
2159/* Enable forwarding of a signal to a process running on a different CPU */
2160static int forward_signal_enabled = 1;
2161SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
2162	   &forward_signal_enabled, 0, "");
2163
2164/* Enable forwarding of roundrobin to all other cpus */
2165static int forward_roundrobin_enabled = 1;
2166SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
2167	   &forward_roundrobin_enabled, 0, "");
2168
2169/*
2170 * This is called once the rest of the system is up and running and we're
2171 * ready to let the AP's out of the pen.
2172 */
2173void ap_init(void);
2174
2175void
2176ap_init()
2177{
2178	u_int	apic_id;
2179
2180	smp_cpus++;
2181
2182#if defined(I586_CPU) && !defined(NO_F00F_HACK)
2183	lidt(&r_idt);
2184#endif
2185
2186	/* Build our map of 'other' CPUs. */
2187	other_cpus = all_cpus & ~(1 << cpuid);
2188
2189	printf("SMP: AP CPU #%d Launched!\n", cpuid);
2190
2191	/* XXX FIXME: i386 specific, and redundant: Setup the FPU. */
2192	load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS);
2193
2194	/* A quick check from sanity claus */
2195	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
2196	if (cpuid != apic_id) {
2197		printf("SMP: cpuid = %d\n", cpuid);
2198		printf("SMP: apic_id = %d\n", apic_id);
2199		printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
2200		panic("cpuid mismatch! boom!!");
2201	}
2202
2203	/* Init local apic for irq's */
2204	apic_initialize();
2205
2206	/* Set memory range attributes for this CPU to match the BSP */
2207	mem_range_AP_init();
2208
2209	/*
2210	 * Activate smp_invltlb, although strictly speaking, this isn't
2211	 * quite correct yet.  We should have a bitfield for cpus willing
2212	 * to accept TLB flush IPI's or something and sync them.
2213	 */
2214	if (smp_cpus == mp_ncpus) {
2215		invltlb_ok = 1;
2216		smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */
2217		smp_active = 1;	 /* historic */
2218	}
2219}
2220
2221#ifdef BETTER_CLOCK
2222
2223#define CHECKSTATE_USER	0
2224#define CHECKSTATE_SYS	1
2225#define CHECKSTATE_INTR	2
2226
2227/* Do not staticize.  Used from apic_vector.s */
2228struct proc*	checkstate_curproc[NCPU];
2229int		checkstate_cpustate[NCPU];
2230u_long		checkstate_pc[NCPU];
2231
2232extern long	cp_time[CPUSTATES];
2233
2234#define PC_TO_INDEX(pc, prof)				\
2235        ((int)(((u_quad_t)((pc) - (prof)->pr_off) *	\
2236            (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
2237
2238static void
2239addupc_intr_forwarded(struct proc *p, int id, int *astmap)
2240{
2241	int i;
2242	struct uprof *prof;
2243	u_long pc;
2244
2245	pc = checkstate_pc[id];
2246	prof = &p->p_stats->p_prof;
2247	if (pc >= prof->pr_off &&
2248	    (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) {
2249		if ((p->p_flag & P_OWEUPC) == 0) {
2250			prof->pr_addr = pc;
2251			prof->pr_ticks = 1;
2252			p->p_flag |= P_OWEUPC;
2253		}
2254		*astmap |= (1 << id);
2255	}
2256}
2257
2258static void
2259forwarded_statclock(int id, int pscnt, int *astmap)
2260{
2261	struct pstats *pstats;
2262	long rss;
2263	struct rusage *ru;
2264	struct vmspace *vm;
2265	int cpustate;
2266	struct proc *p;
2267#ifdef GPROF
2268	register struct gmonparam *g;
2269	int i;
2270#endif
2271
2272	p = checkstate_curproc[id];
2273	cpustate = checkstate_cpustate[id];
2274
2275	switch (cpustate) {
2276	case CHECKSTATE_USER:
2277		if (p->p_flag & P_PROFIL)
2278			addupc_intr_forwarded(p, id, astmap);
2279		if (pscnt > 1)
2280			return;
2281		p->p_uticks++;
2282		if (p->p_nice > NZERO)
2283			cp_time[CP_NICE]++;
2284		else
2285			cp_time[CP_USER]++;
2286		break;
2287	case CHECKSTATE_SYS:
2288#ifdef GPROF
2289		/*
2290		 * Kernel statistics are just like addupc_intr, only easier.
2291		 */
2292		g = &_gmonparam;
2293		if (g->state == GMON_PROF_ON) {
2294			i = checkstate_pc[id] - g->lowpc;
2295			if (i < g->textsize) {
2296				i /= HISTFRACTION * sizeof(*g->kcount);
2297				g->kcount[i]++;
2298			}
2299		}
2300#endif
2301		if (pscnt > 1)
2302			return;
2303
2304		if (!p)
2305			cp_time[CP_IDLE]++;
2306		else {
2307			p->p_sticks++;
2308			cp_time[CP_SYS]++;
2309		}
2310		break;
2311	case CHECKSTATE_INTR:
2312	default:
2313#ifdef GPROF
2314		/*
2315		 * Kernel statistics are just like addupc_intr, only easier.
2316		 */
2317		g = &_gmonparam;
2318		if (g->state == GMON_PROF_ON) {
2319			i = checkstate_pc[id] - g->lowpc;
2320			if (i < g->textsize) {
2321				i /= HISTFRACTION * sizeof(*g->kcount);
2322				g->kcount[i]++;
2323			}
2324		}
2325#endif
2326		if (pscnt > 1)
2327			return;
2328		if (p)
2329			p->p_iticks++;
2330		cp_time[CP_INTR]++;
2331	}
2332	if (p != NULL) {
2333		p->p_cpticks++;
2334		if (++p->p_estcpu == 0)
2335			p->p_estcpu--;
2336		if ((p->p_estcpu & 3) == 0) {
2337			resetpriority(p);
2338			if (p->p_priority >= PUSER)
2339				p->p_priority = p->p_usrpri;
2340		}
2341
2342		/* Update resource usage integrals and maximums. */
2343		if ((pstats = p->p_stats) != NULL &&
2344		    (ru = &pstats->p_ru) != NULL &&
2345		    (vm = p->p_vmspace) != NULL) {
2346			ru->ru_ixrss += pgtok(vm->vm_tsize);
2347			ru->ru_idrss += pgtok(vm->vm_dsize);
2348			ru->ru_isrss += pgtok(vm->vm_ssize);
2349			rss = pgtok(vmspace_resident_count(vm));
2350			if (ru->ru_maxrss < rss)
2351				ru->ru_maxrss = rss;
2352        	}
2353	}
2354}
2355
2356void
2357forward_statclock(int pscnt)
2358{
2359	int map;
2360	int id;
2361	int i;
2362
2363	/* Kludge. We don't yet have separate locks for the interrupts
2364	 * and the kernel. This means that we cannot let the other processors
2365	 * handle complex interrupts while inhibiting them from entering
2366	 * the kernel in a non-interrupt context.
2367	 *
2368	 * What we can do, without changing the locking mechanisms yet,
2369	 * is letting the other processors handle a very simple interrupt
2370	 * (wich determines the processor states), and do the main
2371	 * work ourself.
2372	 */
2373
2374	if (!smp_started || !invltlb_ok || cold || panicstr)
2375		return;
2376
2377	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle ) */
2378
2379	map = other_cpus & ~stopped_cpus ;
2380	checkstate_probed_cpus = 0;
2381	if (map != 0)
2382		selected_apic_ipi(map,
2383				  XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2384
2385	i = 0;
2386	while (checkstate_probed_cpus != map) {
2387		/* spin */
2388		i++;
2389		if (i == 100000) {
2390#ifdef BETTER_CLOCK_DIAGNOSTIC
2391			printf("forward_statclock: checkstate %x\n",
2392			       checkstate_probed_cpus);
2393#endif
2394			break;
2395		}
2396	}
2397
2398	/*
2399	 * Step 2: walk through other processors processes, update ticks and
2400	 * profiling info.
2401	 */
2402
2403	map = 0;
2404	for (id = 0; id < mp_ncpus; id++) {
2405		if (id == cpuid)
2406			continue;
2407		if (((1 << id) & checkstate_probed_cpus) == 0)
2408			continue;
2409		forwarded_statclock(id, pscnt, &map);
2410	}
2411	if (map != 0) {
2412		checkstate_need_ast |= map;
2413		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2414		i = 0;
2415		while ((checkstate_need_ast & map) != 0) {
2416			/* spin */
2417			i++;
2418			if (i > 100000) {
2419#ifdef BETTER_CLOCK_DIAGNOSTIC
2420				printf("forward_statclock: dropped ast 0x%x\n",
2421				       checkstate_need_ast & map);
2422#endif
2423				break;
2424			}
2425		}
2426	}
2427}
2428
2429void
2430forward_hardclock(int pscnt)
2431{
2432	int map;
2433	int id;
2434	struct proc *p;
2435	struct pstats *pstats;
2436	int i;
2437
2438	/* Kludge. We don't yet have separate locks for the interrupts
2439	 * and the kernel. This means that we cannot let the other processors
2440	 * handle complex interrupts while inhibiting them from entering
2441	 * the kernel in a non-interrupt context.
2442	 *
2443	 * What we can do, without changing the locking mechanisms yet,
2444	 * is letting the other processors handle a very simple interrupt
2445	 * (wich determines the processor states), and do the main
2446	 * work ourself.
2447	 */
2448
2449	if (!smp_started || !invltlb_ok || cold || panicstr)
2450		return;
2451
2452	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle) */
2453
2454	map = other_cpus & ~stopped_cpus ;
2455	checkstate_probed_cpus = 0;
2456	if (map != 0)
2457		selected_apic_ipi(map,
2458				  XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2459
2460	i = 0;
2461	while (checkstate_probed_cpus != map) {
2462		/* spin */
2463		i++;
2464		if (i == 100000) {
2465#ifdef BETTER_CLOCK_DIAGNOSTIC
2466			printf("forward_hardclock: checkstate %x\n",
2467			       checkstate_probed_cpus);
2468#endif
2469			break;
2470		}
2471	}
2472
2473	/*
2474	 * Step 2: walk through other processors processes, update virtual
2475	 * timer and profiling timer. If stathz == 0, also update ticks and
2476	 * profiling info.
2477	 */
2478
2479	map = 0;
2480	for (id = 0; id < mp_ncpus; id++) {
2481		if (id == cpuid)
2482			continue;
2483		if (((1 << id) & checkstate_probed_cpus) == 0)
2484			continue;
2485		p = checkstate_curproc[id];
2486		if (p) {
2487			pstats = p->p_stats;
2488			if (checkstate_cpustate[id] == CHECKSTATE_USER &&
2489			    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
2490			    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
2491				psignal(p, SIGVTALRM);
2492				map |= (1 << id);
2493			}
2494			if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
2495			    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
2496				psignal(p, SIGPROF);
2497				map |= (1 << id);
2498			}
2499		}
2500		if (stathz == 0) {
2501			forwarded_statclock( id, pscnt, &map);
2502		}
2503	}
2504	if (map != 0) {
2505		checkstate_need_ast |= map;
2506		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2507		i = 0;
2508		while ((checkstate_need_ast & map) != 0) {
2509			/* spin */
2510			i++;
2511			if (i > 100000) {
2512#ifdef BETTER_CLOCK_DIAGNOSTIC
2513				printf("forward_hardclock: dropped ast 0x%x\n",
2514				       checkstate_need_ast & map);
2515#endif
2516				break;
2517			}
2518		}
2519	}
2520}
2521
2522#endif /* BETTER_CLOCK */
2523
2524void
2525forward_signal(struct proc *p)
2526{
2527	int map;
2528	int id;
2529	int i;
2530
2531	/* Kludge. We don't yet have separate locks for the interrupts
2532	 * and the kernel. This means that we cannot let the other processors
2533	 * handle complex interrupts while inhibiting them from entering
2534	 * the kernel in a non-interrupt context.
2535	 *
2536	 * What we can do, without changing the locking mechanisms yet,
2537	 * is letting the other processors handle a very simple interrupt
2538	 * (wich determines the processor states), and do the main
2539	 * work ourself.
2540	 */
2541
2542	if (!smp_started || !invltlb_ok || cold || panicstr)
2543		return;
2544	if (!forward_signal_enabled)
2545		return;
2546	while (1) {
2547		if (p->p_stat != SRUN)
2548			return;
2549		id = p->p_oncpu;
2550		if (id == 0xff)
2551			return;
2552		map = (1<<id);
2553		checkstate_need_ast |= map;
2554		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2555		i = 0;
2556		while ((checkstate_need_ast & map) != 0) {
2557			/* spin */
2558			i++;
2559			if (i > 100000) {
2560#if 0
2561				printf("forward_signal: dropped ast 0x%x\n",
2562				       checkstate_need_ast & map);
2563#endif
2564				break;
2565			}
2566		}
2567		if (id == p->p_oncpu)
2568			return;
2569	}
2570}
2571
2572void
2573forward_roundrobin(void)
2574{
2575	u_int map;
2576	int i;
2577
2578	if (!smp_started || !invltlb_ok || cold || panicstr)
2579		return;
2580	if (!forward_roundrobin_enabled)
2581		return;
2582	resched_cpus |= other_cpus;
2583	map = other_cpus & ~stopped_cpus ;
2584#if 1
2585	selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2586#else
2587	(void) all_but_self_ipi(XCPUAST_OFFSET);
2588#endif
2589	i = 0;
2590	while ((checkstate_need_ast & map) != 0) {
2591		/* spin */
2592		i++;
2593		if (i > 100000) {
2594#if 0
2595			printf("forward_roundrobin: dropped ast 0x%x\n",
2596			       checkstate_need_ast & map);
2597#endif
2598			break;
2599		}
2600	}
2601}
2602
2603
2604#ifdef APIC_INTR_REORDER
2605/*
2606 *	Maintain mapping from softintr vector to isr bit in local apic.
2607 */
2608void
2609set_lapic_isrloc(int intr, int vector)
2610{
2611	if (intr < 0 || intr > 32)
2612		panic("set_apic_isrloc: bad intr argument: %d",intr);
2613	if (vector < ICU_OFFSET || vector > 255)
2614		panic("set_apic_isrloc: bad vector argument: %d",vector);
2615	apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2);
2616	apic_isrbit_location[intr].bit = (1<<(vector & 31));
2617}
2618#endif
2619