mp_x86.c revision 31639
1/*
2 * Copyright (c) 1996, by Steve Passe
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. The name of the developer may NOT be used to endorse or promote products
11 *    derived from this software without specific prior written permission.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 *	$Id: mp_machdep.c,v 1.40 1997/12/04 19:30:03 smp Exp smp $
26 */
27
28#include "opt_smp.h"
29#include "opt_vm86.h"
30
31#ifdef SMP
32#include <machine/smptests.h>
33#else
34#error
35#endif
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/proc.h>
41#include <sys/sysctl.h>
42#ifdef BETTER_CLOCK
43#include <sys/dkstat.h>
44#endif
45
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/pmap.h>
49#include <vm/vm_kern.h>
50#include <vm/vm_extern.h>
51#ifdef BETTER_CLOCK
52#include <sys/lock.h>
53#include <vm/vm_map.h>
54#include <sys/user.h>
55#endif
56
57#include <machine/smp.h>
58#include <machine/apic.h>
59#include <machine/mpapic.h>
60#include <machine/segments.h>
61#include <machine/smptests.h>	/** TEST_DEFAULT_CONFIG, TEST_TEST1 */
62#include <machine/tss.h>
63#include <machine/specialreg.h>
64#include <machine/cputypes.h>
65
66#include <i386/i386/cons.h>	/* cngetc() */
67
68#if defined(APIC_IO)
69#include <machine/md_var.h>		/* setidt() */
70#include <i386/isa/icu.h>		/* IPIs */
71#include <i386/isa/intr_machdep.h>	/* IPIs */
72#endif	/* APIC_IO */
73
/*
 * Source of the "feature byte 1" value that selects a default
 * configuration.  Building with TEST_DEFAULT_CONFIG substitutes that
 * constant for the byte read from the MP floating pointer structure.
 */
#if defined(TEST_DEFAULT_CONFIG)
#define MPFPS_MPFB1	TEST_DEFAULT_CONFIG
#else
#define MPFPS_MPFB1	mpfps->mpfb1
#endif  /* TEST_DEFAULT_CONFIG */

/* warm boot vector locations, expressed as kernel virtual addresses */
#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

/* BIOS region searched by mp_probe() for the MP signature */
#define BIOS_BASE		(0xf0000)
#define BIOS_SIZE		(0x10000)
#define BIOS_COUNT		(BIOS_SIZE/4)	/* size in 32-bit words */

/* CMOS index/data I/O ports (see CHECK_READ/CHECK_WRITE) and values */
#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)

/* bits tested in the cpu_flags / apic_flags byte of MP table entries */
#define PROCENTRY_FLAG_EN	0x01	/* processor is enabled */
#define PROCENTRY_FLAG_BP	0x02	/* processor is the BSP */
#define IOAPICENTRY_FLAG_EN	0x01	/* IO APIC is enabled */
97
98/* MP Floating Pointer Structure */
typedef struct MPFPS {
	char    signature[4];	/* located by search_for_sig() via MP_SIG */
	void   *pap;		/* MP configuration table header; must be
				 * non-zero unless a default config is used */
	u_char  length;
	u_char  spec_rev;
	u_char  checksum;
	u_char  mpfb1;		/* non-zero: 'default' configuration type */
	u_char  mpfb2;		/* bit 0x80 set: PIC mode, else virtual wire */
	u_char  mpfb3;
	u_char  mpfb4;
	u_char  mpfb5;
}      *mpfps_t;
111
112/* MP Configuration Table Header */
typedef struct MPCTH {
	char    signature[4];
	u_short base_table_length;	/* the table walkers subtract
					 * sizeof(struct MPCTH) from this */
	u_char  spec_rev;
	u_char  checksum;
	u_char  oem_id[8];
	u_char  product_id[12];
	void   *oem_table_pointer;
	u_short oem_table_size;
	u_short entry_count;		/* # of base table entries to walk */
	void   *apic_address;		/* copied into cpu_apic_address */
	u_short extended_table_length;
	u_char  extended_table_checksum;
	u_char  reserved;
}      *mpcth_t;
128
129
/* base table entry type 0: processor */
typedef struct PROCENTRY {
	u_char  type;
	u_char  apic_id;	/* local APIC ID of this processor */
	u_char  apic_version;
	u_char  cpu_flags;	/* PROCENTRY_FLAG_EN, PROCENTRY_FLAG_BP */
	u_long  cpu_signature;
	u_long  feature_flags;
	u_long  reserved1;
	u_long  reserved2;
}      *proc_entry_ptr;
140
/* base table entry type 1: bus */
typedef struct BUSENTRY {
	u_char  type;
	u_char  bus_id;
	char    bus_type[6];	/* bus_entry() reads this up to the first
				 * blank; it is not NUL terminated here */
}      *bus_entry_ptr;
146
/* base table entry type 2: IO APIC */
typedef struct IOAPICENTRY {
	u_char  type;
	u_char  apic_id;
	u_char  apic_version;
	u_char  apic_flags;	/* IOAPICENTRY_FLAG_EN */
	void   *apic_address;	/* recorded in io_apic_address[] by pass 1 */
}      *io_apic_entry_ptr;
154
/* base table entry types 3 and 4: IO and local interrupt assignment */
typedef struct INTENTRY {
	u_char  type;
	u_char  int_type;	/* 0 is treated as a standard INT by the
				 * pin lookup routines */
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
}      *int_entry_ptr;
164
165/* descriptions of MP basetable entries */
typedef struct BASETABLE_ENTRY {
	u_char  type;		/* entry type code (first byte of an entry) */
	u_char  length;		/* entry size, used to step to the next one */
	char    name[16];	/* human readable name */
}       basetable_entry;
171
172/*
173 * this code MUST be enabled here and in mpboot.s.
174 * it follows the very early stages of AP boot by placing values in CMOS ram.
175 * it NORMALLY will never be needed and thus the primitive method for enabling.
176 *
177#define CHECK_POINTS
178 */
179
#if defined(CHECK_POINTS)
/* access one CMOS byte: select index A through CMOS_REG, then use CMOS_DATA */
#define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))

/*
 * Set all six checkpoint bytes (CMOS 0x34..0x39) to D.
 * Wrapped in do { } while (0) so that "CHECK_INIT(x);" is exactly one
 * statement, even as the body of an unbraced if/else.
 */
#define CHECK_INIT(D)				\
	do {					\
		CHECK_WRITE(0x34, (D));		\
		CHECK_WRITE(0x35, (D));		\
		CHECK_WRITE(0x36, (D));		\
		CHECK_WRITE(0x37, (D));		\
		CHECK_WRITE(0x38, (D));		\
		CHECK_WRITE(0x39, (D));		\
	} while (0)

/* print the six checkpoint bytes, prefixed by the string S */
#define CHECK_PRINT(S)				\
	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
	   (S),					\
	   CHECK_READ(0x34),			\
	   CHECK_READ(0x35),			\
	   CHECK_READ(0x36),			\
	   CHECK_READ(0x37),			\
	   CHECK_READ(0x38),			\
	   CHECK_READ(0x39))

#else				/* CHECK_POINTS */

/* checkpoints disabled: both macros expand to nothing */
#define CHECK_INIT(D)
#define CHECK_PRINT(S)

#endif				/* CHECK_POINTS */
208
209/*
210 * Values to send to the POST hardware.
211 */
/* early boot: mp_bootaddress(), mp_probe(), mptable_pass1() */
#define MP_BOOTADDRESS_POST	0x10
#define MP_PROBE_POST		0x11
#define MPTABLE_PASS1_POST	0x12

/* SMP bring-up: mp_start(), mp_enable(), mptable_pass2() */
#define MP_START_POST		0x13
#define MP_ENABLE_POST		0x14
#define MPTABLE_PASS2_POST	0x15

/* AP startup (the routines using these lie later in the file) */
#define START_ALL_APS_POST	0x16
#define INSTALL_AP_TRAMP_POST	0x17
#define START_AP_POST		0x18

/* mp_announce() */
#define MP_ANNOUNCE_POST	0x19
225
226
227/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
228int	current_postcode;
229
230/** XXX FIXME: what system files declare these??? */
231extern struct region_descriptor r_gdt, r_idt;
232
int	bsp_apic_ready = 0;	/* flags usability of BSP apic */
int	mp_ncpus;		/* # of CPUs, including BSP */
int	mp_naps;		/* # of Application Processors */
int	mp_nbusses;		/* # of busses */
int	mp_napics;		/* # of IO APICs */
int	boot_cpu_id;		/* designated BSP */
vm_offset_t cpu_apic_address;	/* local APIC address, common to all CPUs */
vm_offset_t io_apic_address[NAPICID];	/* NAPICID is more than enough */
extern	int nkpt;

/* APIC version registers, indexed by logical CPU / logical IO APIC number */
u_int32_t cpu_apic_versions[NCPU];
u_int32_t io_apic_versions[NAPIC];

/*
 * APIC ID logical/physical mapping structures.
 * We oversize these to simplify boot-time config.
 * All three are reset to -1 by mptable_pass2().
 */
int     cpu_num_to_apic_id[NAPICID];
int     io_num_to_apic_id[NAPICID];
int     apic_id_to_logical[NAPICID];
254
/*
 * Storage for NPPROVMTRR base/mask register pairs.
 * NOTE(review): presumably filled by getmtrr() and replayed on each AP by
 * putmtrr(); both are defined outside this part of the file -- confirm.
 */
#define NPPROVMTRR		8
#define PPRO_VMTRRphysBase0	0x200
#define PPRO_VMTRRphysMask0	0x201
static struct {
	u_int64_t base, mask;
} PPro_vmtrr[NPPROVMTRR];
261
262/* Bitmap of all available CPUs */
u_int	all_cpus;

/* AP uses this PTD during bootstrap */
pd_entry_t *bootPTD;

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

/* Virtual address of per-cpu common_tss */
extern struct i386tss common_tss;
#ifdef VM86
extern struct segment_descriptor common_tssd;
extern u_int private_tss;		/* flag indicating private tss */
extern u_int my_tr;			/* without VM86, init_secondary()
					 * uses a local of the same name */
#endif /* VM86 */

/* IdlePTD per cpu */
pd_entry_t *IdlePTDS[NCPU];

/* "my" private page table page, for BSP init */
extern pt_entry_t SMP_prvpt[];

/* Private page pointer to curcpu's PTD, used during BSP init */
extern pd_entry_t *my_idlePTD;

static int smp_started;		/* has the system started? */
289
290/*
291 * Local data and functions.
292 */
293
static int	mp_capable;		/* set by mp_probe(): MP table found */
static u_int	boot_address;		/* AP trampoline address, see
					 * mp_bootaddress() */
static u_int	base_memory;		/* base memory size in bytes */

static int	picmode;		/* 0: virtual wire mode, 1: PIC mode */
static mpfps_t	mpfps;			/* MP floating pointer structure, or 0 */
static int	search_for_sig(u_int32_t target, int count);
static void	mp_enable(u_int boot_addr);

/* MP table parsing and fix-up */
static int	mptable_pass1(void);
static int	mptable_pass2(void);
static void	default_mp_table(int type);
static void	fix_mp_table(void);

/* lock setup and AP startup */
static void	init_locks(void);
static int	start_all_aps(u_int boot_addr);
static void	install_ap_tramp(u_int boot_addr);
static int	start_ap(int logicalCpu, u_int boot_addr);

/* MTRR save/restore helpers */
static void	getmtrr(void);
static void	putmtrr(void);
static void	putfmtrr(void);
314
315
316/*
317 * Calculate usable address in base memory for AP trampoline code.
318 */
319u_int
320mp_bootaddress(u_int basemem)
321{
322	POSTCODE(MP_BOOTADDRESS_POST);
323
324	base_memory = basemem * 1024;	/* convert to bytes */
325
326	boot_address = base_memory & ~0xfff;	/* round down to 4k boundary */
327	if ((base_memory - boot_address) < bootMP_size)
328		boot_address -= 4096;	/* not enough, lower by 4k */
329
330	return boot_address;
331}
332
333
334/*
335 * Look for an Intel MP spec table (ie, SMP capable hardware).
336 */
337int
338mp_probe(void)
339{
340	int     x;
341	u_long  segment;
342	u_int32_t target;
343
344	POSTCODE(MP_PROBE_POST);
345
346	/* see if EBDA exists */
347	if (segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) {
348		/* search first 1K of EBDA */
349		target = (u_int32_t) (segment << 4);
350		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
351			goto found;
352	} else {
353		/* last 1K of base memory, effective 'top of base' passed in */
354		target = (u_int32_t) (base_memory - 0x400);
355		if ((x = search_for_sig(target, 1024 / 4)) >= 0)
356			goto found;
357	}
358
359	/* search the BIOS */
360	target = (u_int32_t) BIOS_BASE;
361	if ((x = search_for_sig(target, BIOS_COUNT)) >= 0)
362		goto found;
363
364	/* nothing found */
365	mpfps = (mpfps_t)0;
366	mp_capable = 0;
367	return 0;
368
369found:
370	/* calculate needed resources */
371	mpfps = (mpfps_t)x;
372	if (mptable_pass1())
373		panic("you must reconfigure your kernel");
374
375	/* flag fact that we are running multiple processors */
376	mp_capable = 1;
377	return 1;
378}
379
380
381/*
382 * Startup the SMP processors.
383 */
384void
385mp_start(void)
386{
387	POSTCODE(MP_START_POST);
388
389	/* look for MP capable motherboard */
390	if (mp_capable)
391		mp_enable(boot_address);
392	else
393		panic("MP hardware not found!");
394}
395
396
397/*
398 * Print various information about the SMP system hardware and setup.
399 */
400void
401mp_announce(void)
402{
403	int     x;
404
405	POSTCODE(MP_ANNOUNCE_POST);
406
407	printf("FreeBSD/SMP: Multiprocessor motherboard\n");
408	printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0));
409	printf(", version: 0x%08x", cpu_apic_versions[0]);
410	printf(", at 0x%08x\n", cpu_apic_address);
411	for (x = 1; x <= mp_naps; ++x) {
412		printf(" cpu%d (AP):  apic id: %2d", x, CPU_TO_ID(x));
413		printf(", version: 0x%08x", cpu_apic_versions[x]);
414		printf(", at 0x%08x\n", cpu_apic_address);
415	}
416
417#if defined(APIC_IO)
418	for (x = 0; x < mp_napics; ++x) {
419		printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x));
420		printf(", version: 0x%08x", io_apic_versions[x]);
421		printf(", at 0x%08x\n", io_apic_address[x]);
422	}
423#else
424	printf(" Warning: APIC I/O disabled\n");
425#endif	/* APIC_IO */
426}
427
428/*
429 * AP cpu's call this to sync up protected mode.
430 */
void
init_secondary(void)
{
	int	gsel_tss;
#ifndef VM86
	u_int	my_tr;		/* with VM86 this is a global instead */
#endif

	/* the GDT holds NGDT fixed descriptors plus one slot per CPU */
	r_gdt.rd_limit = sizeof(gdt[0]) * (NGDT + NCPU) - 1;
	r_gdt.rd_base = (int) gdt;
	lgdt(&r_gdt);			/* does magic intra-segment return */
	lidt(&r_idt);
	lldt(_default_ldt);

	/* this CPU's TSS descriptor lives in GDT slot NGDT + cpuid */
	my_tr = NGDT + cpuid;
	gsel_tss = GSEL(my_tr, SEL_KPL);
	gdt[my_tr].sd.sd_type = SDT_SYS386TSS;
	common_tss.tss_esp0 = 0;	/* not used until after switch */
	common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	common_tss.tss_ioopt = (sizeof common_tss) << 16;
#ifdef VM86
	common_tssd = gdt[my_tr].sd;
	private_tss = 0;
#endif /* VM86 */
	ltr(gsel_tss);

	/*
	 * NOTE(review): hard-coded CR0 image; decode it against the CR0_*
	 * definitions in <machine/specialreg.h> before changing it.
	 */
	load_cr0(0x8005003b);		/* XXX! */

	/* clear page directory entry 0, then apply pmap options */
	PTD[0] = 0;
	pmap_set_opt((unsigned *)PTD);

	putmtrr();
	putfmtrr();

	/* flush the TLB now that the page directory has changed */
	invltlb();
}
467
468
469#if defined(APIC_IO)
470/*
471 * Final configuration of the BSP's local APIC:
472 *  - disable 'pic mode'.
473 *  - disable 'virtual wire mode'.
474 *  - enable NMI.
475 */
void
bsp_apic_configure(void)
{
	u_char		byte;
	u_int32_t	temp;

	/* leave 'pic mode' if necessary (picmode is set by mptable_pass2()) */
	if (picmode) {
		outb(0x22, 0x70);	/* select IMCR */
		byte = inb(0x23);	/* current contents */
		byte |= 0x01;		/* mask external INTR */
		outb(0x23, byte);	/* disconnect 8259s/NMI */
	}

	/* mask lint0 (the 8259 'virtual wire' connection) */
	temp = lapic.lvt_lint0;
	temp |= APIC_LVT_M;		/* set the mask */
	lapic.lvt_lint0 = temp;

        /* setup lint1 to handle NMI */
        temp = lapic.lvt_lint1;
        temp &= ~APIC_LVT_M;		/* clear the mask */
        lapic.lvt_lint1 = temp;

	/* dump the resulting APIC state on verbose boots */
	if (bootverbose)
		apic_dump("bsp_apic_configure()");
}
503#endif  /* APIC_IO */
504
505
506/*******************************************************************
507 * local functions and data
508 */
509
510/*
511 * start the SMP system
512 */
static void
mp_enable(u_int boot_addr)
{
	int     x;		/* default config type from pass 2, or 0 */
#if defined(APIC_IO)
	int     apic;
	u_int   ux;
#endif	/* APIC_IO */

	getmtrr();
	putfmtrr();

	POSTCODE(MP_ENABLE_POST);

	/* turn on 4MB of V == P addressing so we can get to MP table */
	*(int *)PTD = PG_V | PG_RW | ((u_long)KPTphys & PG_FRAME);
	invltlb();

	/* examine the MP table for needed info, uses physical addresses */
	x = mptable_pass2();

	/* remove the temporary V == P mapping again */
	*(int *)PTD = 0;
	invltlb();

	/* can't process default configs till the CPU APIC is pmapped */
	if (x)
		default_mp_table(x);

	/* post scan cleanup */
	fix_mp_table();

#if defined(APIC_IO)

	/* fill the LOGICAL io_apic_versions table */
	for (apic = 0; apic < mp_napics; ++apic) {
		ux = io_apic_read(apic, IOAPIC_VER);
		io_apic_versions[apic] = ux;
	}

	/* program each IO APIC in the system */
	for (apic = 0; apic < mp_napics; ++apic)
		if (io_apic_setup(apic) < 0)
			panic("IO APIC setup failure");

	/* install a 'Spurious INTerrupt' vector */
	setidt(XSPURIOUSINT_OFFSET, Xspuriousint,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for TLB invalidation */
	setidt(XINVLTLB_OFFSET, Xinvltlb,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#ifdef BETTER_CLOCK
	/* install an inter-CPU IPI for reading processor state */
	setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

	/* install an inter-CPU IPI for forcing an additional software trap */
	setidt(XCPUAST_OFFSET, Xcpuast,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif

	/* install an inter-CPU IPI for CPU stop/restart */
	setidt(XCPUSTOP_OFFSET, Xcpustop,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));

#if defined(TEST_TEST1)
	/* install a "fake hardware INTerrupt" vector */
	setidt(XTEST1_OFFSET, Xtest1,
	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif  /** TEST_TEST1 */

#endif	/* APIC_IO */

	/* initialize all SMP locks */
	init_locks();

	/*
	 * start each Application Processor
	 * NOTE(review): the int result of start_all_aps() is ignored here.
	 */
	start_all_aps(boot_addr);

	/*
	 * The init process might be started on a different CPU now,
	 * and the boot CPU might not call prepare_usermode to get
	 * cr0 correctly configured. Thus we initialize cr0 here.
	 */
	load_cr0(rcr0() | CR0_WP | CR0_AM);
}
600
601
602/*
603 * look for the MP spec signature
604 */
605
606/* string defined by the Intel MP Spec as identifying the MP table */
607#define MP_SIG		0x5f504d5f	/* _MP_ */
608#define NEXT(X)		((X) += 4)
609static int
610search_for_sig(u_int32_t target, int count)
611{
612	int     x;
613	u_int32_t *addr = (u_int32_t *) (KERNBASE + target);
614
615	for (x = 0; x < count; NEXT(x))
616		if (addr[x] == MP_SIG)
617			/* make array index a byte index */
618			return (target + (x * sizeof(u_int32_t)));
619
620	return -1;
621}
622
623
/*
 * Base table entry descriptions, indexed by entry type code.
 * The 'length' column is what the table walkers in mptable_pass1() and
 * mptable_pass2() add to their position to reach the next entry.
 */
static basetable_entry basetable_entry_types[] =
{
	{0, 20, "Processor"},
	{1, 8, "Bus"},
	{2, 8, "I/O APIC"},
	{3, 8, "I/O INT"},
	{4, 8, "Local INT"}
};
632
/* one bus as recorded by bus_entry(); bus_id 0xff marks an unused slot */
typedef struct BUSDATA {
	u_char  bus_id;
	enum busTypes bus_type;
}       bus_datum;

/*
 * kernel copy of an interrupt assignment entry (see int_entry());
 * int_type 0xff marks an unused slot
 */
typedef struct INTDATA {
	u_char  int_type;
	u_short int_flags;
	u_char  src_bus_id;
	u_char  src_bus_irq;
	u_char  dst_apic_id;
	u_char  dst_apic_int;
}       io_int, local_int;

/* maps a bus type string from the MP table to a bus type code */
typedef struct BUSTYPENAME {
	u_char  type;
	char    name[7];	/* up to 6 characters plus the terminator */
}       bus_type_name;
651
/*
 * Bus type names searched by lookup_bus_type().  The "---" rows are
 * placeholders that map to UNKNOWN_BUSTYPE.
 * NOTE(review): row order presumably follows the busTypes enum values
 * (declared outside this file) -- confirm before reordering.
 */
static bus_type_name bus_type_table[] =
{
	{CBUS, "CBUS"},
	{CBUSII, "CBUSII"},
	{EISA, "EISA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{ISA, "ISA"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{PCI, "PCI"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{UNKNOWN_BUSTYPE, "---"},
	{XPRESS, "XPRESS"},
	{UNKNOWN_BUSTYPE, "---"}
};
/*
 * from MP spec v1.4, table 5-1
 *
 * One row per default configuration type; mptable_pass1() indexes it with
 * (type - 1) and reads column 0 as the bus count.  255 marks an unused
 * field.
 */
static int default_data[7][5] =
{
/*   nbus, id0, type0, id1, type1 */
	{1, 0, ISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{1, 0, EISA, 255, 255},
	{0, 255, 255, 255, 255},/* MCA not supported */
	{2, 0, ISA, 1, PCI},
	{2, 0, EISA, 1, PCI},
	{0, 255, 255, 255, 255}	/* MCA not supported */
};
686
687
/* the bus data, filled by mptable_pass2() via bus_entry() */
bus_datum bus_data[NBUS];

/* the IO INT data, one entry per possible APIC INTerrupt */
io_int  io_apic_ints[NINTR];

/* number of type 3 (I/O INT) entries counted by mptable_pass1() */
static int nintrs;

/*
 * per-entry handlers used by mptable_pass2(); each returns non-zero when
 * the caller should advance the corresponding slot counter
 */
static int processor_entry	__P((proc_entry_ptr entry, int cpu));
static int bus_entry		__P((bus_entry_ptr entry, int bus));
static int io_apic_entry	__P((io_apic_entry_ptr entry, int apic));
static int int_entry		__P((int_entry_ptr entry, int intr));
static int lookup_bus_type	__P((char *name));
701
702
703/*
704 * 1st pass on motherboard's Intel MP specification table.
705 *
706 * initializes:
707 *	mp_ncpus = 1
708 *
709 * determines:
710 *	cpu_apic_address (common to all CPUs)
711 *	io_apic_address[N]
712 *	mp_naps
713 *	mp_nbusses
714 *	mp_napics
715 *	nintrs
716 */
717static int
718mptable_pass1(void)
719{
720	int	x;
721	mpcth_t	cth;
722	int	totalSize;
723	void*	position;
724	int	count;
725	int	type;
726	int	mustpanic;
727
728	POSTCODE(MPTABLE_PASS1_POST);
729
730	mustpanic = 0;
731
732	/* clear various tables */
733	for (x = 0; x < NAPICID; ++x) {
734		io_apic_address[x] = ~0;	/* IO APIC address table */
735	}
736
737	/* init everything to empty */
738	mp_naps = 0;
739	mp_nbusses = 0;
740	mp_napics = 0;
741	nintrs = 0;
742
743	/* check for use of 'default' configuration */
744	if (MPFPS_MPFB1 != 0) {
745		/* use default addresses */
746		cpu_apic_address = DEFAULT_APIC_BASE;
747		io_apic_address[0] = DEFAULT_IO_APIC_BASE;
748
749		/* fill in with defaults */
750		mp_naps = 2;		/* includes BSP */
751		mp_nbusses = default_data[MPFPS_MPFB1 - 1][0];
752#if defined(APIC_IO)
753		mp_napics = 1;
754		nintrs = 16;
755#endif	/* APIC_IO */
756	}
757	else {
758		if ((cth = mpfps->pap) == 0)
759			panic("MP Configuration Table Header MISSING!");
760
761		cpu_apic_address = (vm_offset_t) cth->apic_address;
762
763		/* walk the table, recording info of interest */
764		totalSize = cth->base_table_length - sizeof(struct MPCTH);
765		position = (u_char *) cth + sizeof(struct MPCTH);
766		count = cth->entry_count;
767
768		while (count--) {
769			switch (type = *(u_char *) position) {
770			case 0: /* processor_entry */
771				if (((proc_entry_ptr)position)->cpu_flags
772					& PROCENTRY_FLAG_EN)
773					++mp_naps;
774				break;
775			case 1: /* bus_entry */
776				++mp_nbusses;
777				break;
778			case 2: /* io_apic_entry */
779				if (((io_apic_entry_ptr)position)->apic_flags
780					& IOAPICENTRY_FLAG_EN)
781					io_apic_address[mp_napics++] =
782					    (vm_offset_t)((io_apic_entry_ptr)
783						position)->apic_address;
784				break;
785			case 3: /* int_entry */
786				++nintrs;
787				break;
788			case 4:	/* int_entry */
789				break;
790			default:
791				panic("mpfps Base Table HOSED!");
792				/* NOTREACHED */
793			}
794
795			totalSize -= basetable_entry_types[type].length;
796			(u_char*)position += basetable_entry_types[type].length;
797		}
798	}
799
800	/* qualify the numbers */
801	if (mp_naps > NCPU)
802#if 0 /* XXX FIXME: kern/4255 */
803		printf("Warning: only using %d of %d available CPUs!\n",
804			NCPU, mp_naps);
805#else
806	{
807		printf("NCPU cannot be different than actual CPU count.\n");
808		printf(" add 'options NCPU=%d' to your kernel config file,\n",
809			mp_naps);
810		printf(" then rerun config & rebuild your SMP kernel\n");
811		mustpanic = 1;
812	}
813#endif /* XXX FIXME: kern/4255 */
814	if (mp_nbusses > NBUS) {
815		printf("found %d busses, increase NBUS\n", mp_nbusses);
816		mustpanic = 1;
817	}
818	if (mp_napics > NAPIC) {
819		printf("found %d apics, increase NAPIC\n", mp_napics);
820		mustpanic = 1;
821	}
822	if (nintrs > NINTR) {
823		printf("found %d intrs, increase NINTR\n", nintrs);
824		mustpanic = 1;
825	}
826
827	/*
828	 * Count the BSP.
829	 * This is also used as a counter while starting the APs.
830	 */
831	mp_ncpus = 1;
832
833	--mp_naps;	/* subtract the BSP */
834
835	return mustpanic;
836}
837
838
839/*
840 * 2nd pass on motherboard's Intel MP specification table.
841 *
842 * sets:
843 *	boot_cpu_id
844 *	ID_TO_IO(N), phy APIC ID to log CPU/IO table
845 *	CPU_TO_ID(N), logical CPU to APIC ID table
846 *	IO_TO_ID(N), logical IO to APIC ID table
847 *	bus_data[N]
848 *	io_apic_ints[N]
849 */
850static int
851mptable_pass2(void)
852{
853	int     x;
854	mpcth_t cth;
855	int     totalSize;
856	void*   position;
857	int     count;
858	int     type;
859	int     apic, bus, cpu, intr;
860
861	POSTCODE(MPTABLE_PASS2_POST);
862
863	/* clear various tables */
864	for (x = 0; x < NAPICID; ++x) {
865		ID_TO_IO(x) = -1;	/* phy APIC ID to log CPU/IO table */
866		CPU_TO_ID(x) = -1;	/* logical CPU to APIC ID table */
867		IO_TO_ID(x) = -1;	/* logical IO to APIC ID table */
868	}
869
870	/* clear bus data table */
871	for (x = 0; x < NBUS; ++x)
872		bus_data[x].bus_id = 0xff;
873
874	/* clear IO APIC INT table */
875	for (x = 0; x < NINTR; ++x)
876		io_apic_ints[x].int_type = 0xff;
877
878	/* setup the cpu/apic mapping arrays */
879	boot_cpu_id = -1;
880
881	/* record whether PIC or virtual-wire mode */
882	picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0;
883
884	/* check for use of 'default' configuration */
885	if (MPFPS_MPFB1 != 0)
886		return MPFPS_MPFB1;	/* return default configuration type */
887
888	if ((cth = mpfps->pap) == 0)
889		panic("MP Configuration Table Header MISSING!");
890
891	/* walk the table, recording info of interest */
892	totalSize = cth->base_table_length - sizeof(struct MPCTH);
893	position = (u_char *) cth + sizeof(struct MPCTH);
894	count = cth->entry_count;
895	apic = bus = intr = 0;
896	cpu = 1;				/* pre-count the BSP */
897
898	while (count--) {
899		switch (type = *(u_char *) position) {
900		case 0:
901			if (processor_entry(position, cpu))
902				++cpu;
903			break;
904		case 1:
905			if (bus_entry(position, bus))
906				++bus;
907			break;
908		case 2:
909			if (io_apic_entry(position, apic))
910				++apic;
911			break;
912		case 3:
913			if (int_entry(position, intr))
914				++intr;
915			break;
916		case 4:
917			/* int_entry(position); */
918			break;
919		default:
920			panic("mpfps Base Table HOSED!");
921			/* NOTREACHED */
922		}
923
924		totalSize -= basetable_entry_types[type].length;
925		(u_char *) position += basetable_entry_types[type].length;
926	}
927
928	if (boot_cpu_id == -1)
929		panic("NO BSP found!");
930
931	/* report fact that its NOT a default configuration */
932	return 0;
933}
934
935
936/*
937 * parse an Intel MP specification table
938 */
939static void
940fix_mp_table(void)
941{
942	int	x;
943	int	id;
944	int	bus_0;
945	int	bus_pci;
946	int	num_pci_bus;
947
948	/*
949	 * Fix mis-numbering of the PCI bus and its INT entries if the BIOS
950	 * did it wrong.  The MP spec says that when more than 1 PCI bus
951	 * exists the BIOS must begin with bus entries for the PCI bus and use
952	 * actual PCI bus numbering.  This implies that when only 1 PCI bus
953	 * exists the BIOS can choose to ignore this ordering, and indeed many
954	 * MP motherboards do ignore it.  This causes a problem when the PCI
955	 * sub-system makes requests of the MP sub-system based on PCI bus
956	 * numbers.	So here we look for the situation and renumber the
957	 * busses and associated INTs in an effort to "make it right".
958	 */
959
960	/* find bus 0, PCI bus, count the number of PCI busses */
961	for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) {
962		if (bus_data[x].bus_id == 0) {
963			bus_0 = x;
964		}
965		if (bus_data[x].bus_type == PCI) {
966			++num_pci_bus;
967			bus_pci = x;
968		}
969	}
970	/*
971	 * bus_0 == slot of bus with ID of 0
972	 * bus_pci == slot of last PCI bus encountered
973	 */
974
975	/* check the 1 PCI bus case for sanity */
976	if (num_pci_bus == 1) {
977
978		/* if it is number 0 all is well */
979		if (bus_data[bus_pci].bus_id == 0)
980			return;
981
982		/* mis-numbered, swap with whichever bus uses slot 0 */
983
984		/* swap the bus entry types */
985		bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type;
986		bus_data[bus_0].bus_type = PCI;
987
988		/* swap each relavant INTerrupt entry */
989		id = bus_data[bus_pci].bus_id;
990		for (x = 0; x < nintrs; ++x) {
991			if (io_apic_ints[x].src_bus_id == id) {
992				io_apic_ints[x].src_bus_id = 0;
993			}
994			else if (io_apic_ints[x].src_bus_id == 0) {
995				io_apic_ints[x].src_bus_id = id;
996			}
997		}
998	}
999	/* sanity check if more than 1 PCI bus */
1000	else if (num_pci_bus > 1) {
1001		for (x = 0; x < mp_nbusses; ++x) {
1002			if (bus_data[x].bus_type != PCI)
1003				continue;
1004			if (bus_data[x].bus_id >= num_pci_bus)
1005				panic("bad PCI bus numbering");
1006		}
1007	}
1008}
1009
1010
1011static int
1012processor_entry(proc_entry_ptr entry, int cpu)
1013{
1014	/* check for usability */
1015	if ((cpu >= NCPU) || !(entry->cpu_flags & PROCENTRY_FLAG_EN))
1016		return 0;
1017
1018	/* check for BSP flag */
1019	if (entry->cpu_flags & PROCENTRY_FLAG_BP) {
1020		boot_cpu_id = entry->apic_id;
1021		CPU_TO_ID(0) = entry->apic_id;
1022		ID_TO_CPU(entry->apic_id) = 0;
1023		return 0;	/* its already been counted */
1024	}
1025
1026	/* add another AP to list, if less than max number of CPUs */
1027	else {
1028		CPU_TO_ID(cpu) = entry->apic_id;
1029		ID_TO_CPU(entry->apic_id) = cpu;
1030		return 1;
1031	}
1032}
1033
1034
1035static int
1036bus_entry(bus_entry_ptr entry, int bus)
1037{
1038	int     x;
1039	char    c, name[8];
1040
1041	/* encode the name into an index */
1042	for (x = 0; x < 6; ++x) {
1043		if ((c = entry->bus_type[x]) == ' ')
1044			break;
1045		name[x] = c;
1046	}
1047	name[x] = '\0';
1048
1049	if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE)
1050		panic("unknown bus type: '%s'", name);
1051
1052	bus_data[bus].bus_id = entry->bus_id;
1053	bus_data[bus].bus_type = x;
1054
1055	return 1;
1056}
1057
1058
1059static int
1060io_apic_entry(io_apic_entry_ptr entry, int apic)
1061{
1062	if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN))
1063		return 0;
1064
1065	IO_TO_ID(apic) = entry->apic_id;
1066	ID_TO_IO(entry->apic_id) = apic;
1067
1068	return 1;
1069}
1070
1071
1072static int
1073lookup_bus_type(char *name)
1074{
1075	int     x;
1076
1077	for (x = 0; x < MAX_BUSTYPE; ++x)
1078		if (strcmp(bus_type_table[x].name, name) == 0)
1079			return bus_type_table[x].type;
1080
1081	return UNKNOWN_BUSTYPE;
1082}
1083
1084
1085static int
1086int_entry(int_entry_ptr entry, int intr)
1087{
1088	io_apic_ints[intr].int_type = entry->int_type;
1089	io_apic_ints[intr].int_flags = entry->int_flags;
1090	io_apic_ints[intr].src_bus_id = entry->src_bus_id;
1091	io_apic_ints[intr].src_bus_irq = entry->src_bus_irq;
1092	io_apic_ints[intr].dst_apic_id = entry->dst_apic_id;
1093	io_apic_ints[intr].dst_apic_int = entry->dst_apic_int;
1094
1095	return 1;
1096}
1097
1098
1099static int
1100apic_int_is_bus_type(int intr, int bus_type)
1101{
1102	int     bus;
1103
1104	for (bus = 0; bus < mp_nbusses; ++bus)
1105		if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id)
1106		    && ((int) bus_data[bus].bus_type == bus_type))
1107			return 1;
1108
1109	return 0;
1110}
1111
1112
1113/*
1114 * Given a traditional ISA INT mask, return an APIC mask.
1115 */
1116u_int
1117isa_apic_mask(u_int isa_mask)
1118{
1119	int isa_irq;
1120	int apic_pin;
1121
1122#if defined(SKIP_IRQ15_REDIRECT)
1123	if (isa_mask == (1 << 15)) {
1124		printf("skipping ISA IRQ15 redirect\n");
1125		return isa_mask;
1126	}
1127#endif  /* SKIP_IRQ15_REDIRECT */
1128
1129	isa_irq = ffs(isa_mask);		/* find its bit position */
1130	if (isa_irq == 0)			/* doesn't exist */
1131		return 0;
1132	--isa_irq;				/* make it zero based */
1133
1134	apic_pin = isa_apic_pin(isa_irq);	/* look for APIC connection */
1135	if (apic_pin == -1)
1136		return 0;
1137
1138	return (1 << apic_pin);			/* convert pin# to a mask */
1139}
1140
1141
1142/*
1143 * Determine which APIC pin an ISA/EISA INT is attached to.
1144 */
1145#define INTTYPE(I)	(io_apic_ints[(I)].int_type)
1146#define INTPIN(I)	(io_apic_ints[(I)].dst_apic_int)
1147
1148#define SRCBUSIRQ(I)	(io_apic_ints[(I)].src_bus_irq)
1149int
1150isa_apic_pin(int isa_irq)
1151{
1152	int     intr;
1153
1154	for (intr = 0; intr < nintrs; ++intr) {		/* check each record */
1155		if (INTTYPE(intr) == 0) {		/* standard INT */
1156			if (SRCBUSIRQ(intr) == isa_irq) {
1157				if (apic_int_is_bus_type(intr, ISA) ||
1158			            apic_int_is_bus_type(intr, EISA))
1159					return INTPIN(intr);	/* found */
1160			}
1161		}
1162	}
1163	return -1;					/* NOT found */
1164}
1165#undef SRCBUSIRQ
1166
1167
1168/*
1169 * Determine which APIC pin a PCI INT is attached to.
1170 */
1171#define SRCBUSID(I)	(io_apic_ints[(I)].src_bus_id)
1172#define SRCBUSDEVICE(I)	((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f)
1173#define SRCBUSLINE(I)	(io_apic_ints[(I)].src_bus_irq & 0x03)
1174int
1175pci_apic_pin(int pciBus, int pciDevice, int pciInt)
1176{
1177	int     intr;
1178
1179	--pciInt;					/* zero based */
1180
1181	for (intr = 0; intr < nintrs; ++intr)		/* check each record */
1182		if ((INTTYPE(intr) == 0)		/* standard INT */
1183		    && (SRCBUSID(intr) == pciBus)
1184		    && (SRCBUSDEVICE(intr) == pciDevice)
1185		    && (SRCBUSLINE(intr) == pciInt))	/* a candidate IRQ */
1186			if (apic_int_is_bus_type(intr, PCI))
1187				return INTPIN(intr);	/* exact match */
1188
1189	return -1;					/* NOT found */
1190}
1191#undef SRCBUSLINE
1192#undef SRCBUSDEVICE
1193#undef SRCBUSID
1194
1195#undef INTPIN
1196#undef INTTYPE
1197
1198
/*
 * Ask the motherboard chipset to stop redirecting an ISA interrupt.
 *
 * XXX FIXME:
 *  What this should really do is still unclear.  It targets motherboards
 *  that redirect the MBIRQ0 pin.  A board could in general route any ISA
 *  INT to an upper (>15) IRQ value, but most of those would NOT go through
 *  MBIRQ0, so "undirecting" them would not be possible.
 *
 * Unless READY is defined this is only a stub: it logs the request and
 * returns 0.
 */
int
undirect_isa_irq(int rirq)
{
#if !defined(READY)
	printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq);
	return 0;
#else
	printf("Freeing redirected ISA irq %d.\n", rirq);
	/** FIXME: tickle the MB redirector chip */
	return ???;
#endif  /* !READY */
}
1221
1222
1223/*
1224 * Reprogram the MB chipset to NOT redirect a PCI INTerrupt
1225 */
1226int
1227undirect_pci_irq(int rirq)
1228{
1229#if defined(READY)
1230	if (bootverbose)
1231		printf("Freeing redirected PCI irq %d.\n", rirq);
1232
1233	/** FIXME: tickle the MB redirector chip */
1234	return ???;
1235#else
1236	if (bootverbose)
1237		printf("Freeing (NOT implemented) redirected PCI irq %d.\n",
1238		       rirq);
1239	return 0;
1240#endif  /* READY */
1241}
1242
1243
1244/*
1245 * given a bus ID, return:
1246 *  the bus type if found
1247 *  -1 if NOT found
1248 */
1249int
1250apic_bus_type(int id)
1251{
1252	int     x;
1253
1254	for (x = 0; x < mp_nbusses; ++x)
1255		if (bus_data[x].bus_id == id)
1256			return bus_data[x].bus_type;
1257
1258	return -1;
1259}
1260
1261
1262/*
1263 * given a LOGICAL APIC# and pin#, return:
1264 *  the associated src bus ID if found
1265 *  -1 if NOT found
1266 */
1267int
1268apic_src_bus_id(int apic, int pin)
1269{
1270	int     x;
1271
1272	/* search each of the possible INTerrupt sources */
1273	for (x = 0; x < nintrs; ++x)
1274		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1275		    (pin == io_apic_ints[x].dst_apic_int))
1276			return (io_apic_ints[x].src_bus_id);
1277
1278	return -1;		/* NOT found */
1279}
1280
1281
1282/*
1283 * given a LOGICAL APIC# and pin#, return:
1284 *  the associated src bus IRQ if found
1285 *  -1 if NOT found
1286 */
1287int
1288apic_src_bus_irq(int apic, int pin)
1289{
1290	int     x;
1291
1292	for (x = 0; x < nintrs; x++)
1293		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1294		    (pin == io_apic_ints[x].dst_apic_int))
1295			return (io_apic_ints[x].src_bus_irq);
1296
1297	return -1;		/* NOT found */
1298}
1299
1300
1301/*
1302 * given a LOGICAL APIC# and pin#, return:
1303 *  the associated INTerrupt type if found
1304 *  -1 if NOT found
1305 */
1306int
1307apic_int_type(int apic, int pin)
1308{
1309	int     x;
1310
1311	/* search each of the possible INTerrupt sources */
1312	for (x = 0; x < nintrs; ++x)
1313		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1314		    (pin == io_apic_ints[x].dst_apic_int))
1315			return (io_apic_ints[x].int_type);
1316
1317	return -1;		/* NOT found */
1318}
1319
1320
1321/*
1322 * given a LOGICAL APIC# and pin#, return:
1323 *  the associated trigger mode if found
1324 *  -1 if NOT found
1325 */
1326int
1327apic_trigger(int apic, int pin)
1328{
1329	int     x;
1330
1331	/* search each of the possible INTerrupt sources */
1332	for (x = 0; x < nintrs; ++x)
1333		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1334		    (pin == io_apic_ints[x].dst_apic_int))
1335			return ((io_apic_ints[x].int_flags >> 2) & 0x03);
1336
1337	return -1;		/* NOT found */
1338}
1339
1340
1341/*
1342 * given a LOGICAL APIC# and pin#, return:
1343 *  the associated 'active' level if found
1344 *  -1 if NOT found
1345 */
1346int
1347apic_polarity(int apic, int pin)
1348{
1349	int     x;
1350
1351	/* search each of the possible INTerrupt sources */
1352	for (x = 0; x < nintrs; ++x)
1353		if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) &&
1354		    (pin == io_apic_ints[x].dst_apic_int))
1355			return (io_apic_ints[x].int_flags & 0x03);
1356
1357	return -1;		/* NOT found */
1358}
1359
1360
1361/*
1362 * set data according to MP defaults
1363 * FIXME: probably not complete yet...
1364 */
1365static void
1366default_mp_table(int type)
1367{
1368	int     ap_cpu_id;
1369#if defined(APIC_IO)
1370	u_int32_t ux;
1371	int     io_apic_id;
1372	int     pin;
1373#endif	/* APIC_IO */
1374
1375#if 0
1376	printf("  MP default config type: %d\n", type);
1377	switch (type) {
1378	case 1:
1379		printf("   bus: ISA, APIC: 82489DX\n");
1380		break;
1381	case 2:
1382		printf("   bus: EISA, APIC: 82489DX\n");
1383		break;
1384	case 3:
1385		printf("   bus: EISA, APIC: 82489DX\n");
1386		break;
1387	case 4:
1388		printf("   bus: MCA, APIC: 82489DX\n");
1389		break;
1390	case 5:
1391		printf("   bus: ISA+PCI, APIC: Integrated\n");
1392		break;
1393	case 6:
1394		printf("   bus: EISA+PCI, APIC: Integrated\n");
1395		break;
1396	case 7:
1397		printf("   bus: MCA+PCI, APIC: Integrated\n");
1398		break;
1399	default:
1400		printf("   future type\n");
1401		break;
1402		/* NOTREACHED */
1403	}
1404#endif	/* 0 */
1405
1406	boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24;
1407	ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0;
1408
1409	/* BSP */
1410	CPU_TO_ID(0) = boot_cpu_id;
1411	ID_TO_CPU(boot_cpu_id) = 0;
1412
1413	/* one and only AP */
1414	CPU_TO_ID(1) = ap_cpu_id;
1415	ID_TO_CPU(ap_cpu_id) = 1;
1416
1417#if defined(APIC_IO)
1418	/* one and only IO APIC */
1419	io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24;
1420
1421	/*
1422	 * sanity check, refer to MP spec section 3.6.6, last paragraph
1423	 * necessary as some hardware isn't properly setting up the IO APIC
1424	 */
1425#if defined(REALLY_ANAL_IOAPICID_VALUE)
1426	if (io_apic_id != 2) {
1427#else
1428	if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) {
1429#endif	/* REALLY_ANAL_IOAPICID_VALUE */
1430		ux = io_apic_read(0, IOAPIC_ID);	/* get current contents */
1431		ux &= ~APIC_ID_MASK;	/* clear the ID field */
1432		ux |= 0x02000000;	/* set it to '2' */
1433		io_apic_write(0, IOAPIC_ID, ux);	/* write new value */
1434		ux = io_apic_read(0, IOAPIC_ID);	/* re-read && test */
1435		if ((ux & APIC_ID_MASK) != 0x02000000)
1436			panic("can't control IO APIC ID, reg: 0x%08x", ux);
1437		io_apic_id = 2;
1438	}
1439	IO_TO_ID(0) = io_apic_id;
1440	ID_TO_IO(io_apic_id) = 0;
1441#endif	/* APIC_IO */
1442
1443	/* fill out bus entries */
1444	switch (type) {
1445	case 1:
1446	case 2:
1447	case 3:
1448	case 5:
1449	case 6:
1450		bus_data[0].bus_id = default_data[type - 1][1];
1451		bus_data[0].bus_type = default_data[type - 1][2];
1452		bus_data[1].bus_id = default_data[type - 1][3];
1453		bus_data[1].bus_type = default_data[type - 1][4];
1454		break;
1455
1456	/* case 4: case 7:		   MCA NOT supported */
1457	default:		/* illegal/reserved */
1458		panic("BAD default MP config: %d", type);
1459		/* NOTREACHED */
1460	}
1461
1462#if defined(APIC_IO)
1463	/* general cases from MP v1.4, table 5-2 */
1464	for (pin = 0; pin < 16; ++pin) {
1465		io_apic_ints[pin].int_type = 0;
1466		io_apic_ints[pin].int_flags = 0x05;	/* edge/active-hi */
1467		io_apic_ints[pin].src_bus_id = 0;
1468		io_apic_ints[pin].src_bus_irq = pin;	/* IRQ2 caught below */
1469		io_apic_ints[pin].dst_apic_id = io_apic_id;
1470		io_apic_ints[pin].dst_apic_int = pin;	/* 1-to-1 */
1471	}
1472
1473	/* special cases from MP v1.4, table 5-2 */
1474	if (type == 2) {
1475		io_apic_ints[2].int_type = 0xff;	/* N/C */
1476		io_apic_ints[13].int_type = 0xff;	/* N/C */
1477#if !defined(APIC_MIXED_MODE)
1478		/** FIXME: ??? */
1479		panic("sorry, can't support type 2 default yet");
1480#endif	/* APIC_MIXED_MODE */
1481	}
1482	else
1483		io_apic_ints[2].src_bus_irq = 0;	/* ISA IRQ0 is on APIC INT 2 */
1484
1485	if (type == 7)
1486		io_apic_ints[0].int_type = 0xff;	/* N/C */
1487	else
1488		io_apic_ints[0].int_type = 3;	/* vectored 8259 */
1489#endif	/* APIC_IO */
1490}
1491
1492
1493/*
1494 * initialize all the SMP locks
1495 */
1496
1497/* critical region around IO APIC, apic_imen */
1498struct simplelock	imen_lock;
1499
1500/* critical region around splxx(), cpl, cml, cil, ipending */
1501struct simplelock	cpl_lock;
1502
1503/* Make FAST_INTR() routines sequential */
1504struct simplelock	fast_intr_lock;
1505
1506/* critical region around INTR() routines */
1507struct simplelock	intr_lock;
1508
1509/* lock regions protected in UP kernel via cli/sti */
1510struct simplelock	mpintr_lock;
1511
1512#ifdef USE_COMLOCK
1513/* locks com (tty) data/hardware accesses: a FASTINTR() */
1514struct simplelock	com_lock;
1515#endif /* USE_COMLOCK */
1516
1517#ifdef USE_CLOCKLOCK
1518/* lock regions around the clock hardware */
1519struct simplelock	clock_lock;
1520#endif /* USE_CLOCKLOCK */
1521
1522static void
1523init_locks(void)
1524{
1525	/*
1526	 * Get the initial mp_lock with a count of 1 for the BSP.
1527	 * This uses a LOGICAL cpu ID, ie BSP == 0.
1528	 */
1529	mp_lock = 0x00000001;
1530
1531	/* ISR uses its own "giant lock" */
1532	isr_lock = FREE_LOCK;
1533
1534	s_lock_init((struct simplelock*)&mpintr_lock);
1535
1536	s_lock_init((struct simplelock*)&fast_intr_lock);
1537	s_lock_init((struct simplelock*)&intr_lock);
1538	s_lock_init((struct simplelock*)&imen_lock);
1539	s_lock_init((struct simplelock*)&cpl_lock);
1540
1541#ifdef USE_COMLOCK
1542	s_lock_init((struct simplelock*)&com_lock);
1543#endif /* USE_COMLOCK */
1544#ifdef USE_CLOCKLOCK
1545	s_lock_init((struct simplelock*)&clock_lock);
1546#endif /* USE_CLOCKLOCK */
1547}
1548
1549
1550/*
1551 * start each AP in our list
1552 */
1553static int
1554start_all_aps(u_int boot_addr)
1555{
1556	int     x, i;
1557	u_char  mpbiosreason;
1558	u_long  mpbioswarmvec;
1559	pd_entry_t *newptd;
1560	pt_entry_t *newpt;
1561	int *newpp;
1562	char *stack;
1563	pd_entry_t	*myPTD;
1564
1565	POSTCODE(START_ALL_APS_POST);
1566
1567	/* initialize BSP's local APIC */
1568	apic_initialize();
1569	bsp_apic_ready = 1;
1570
1571	/* install the AP 1st level boot code */
1572	install_ap_tramp(boot_addr);
1573
1574
1575	/* save the current value of the warm-start vector */
1576	mpbioswarmvec = *((u_long *) WARMBOOT_OFF);
1577	outb(CMOS_REG, BIOS_RESET);
1578	mpbiosreason = inb(CMOS_DATA);
1579
1580	/* record BSP in CPU map */
1581	all_cpus = 1;
1582
1583	/* start each AP */
1584	for (x = 1; x <= mp_naps; ++x) {
1585
1586		/* This is a bit verbose, it will go away soon.  */
1587
1588		/* alloc new page table directory */
1589		newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
1590
1591		/* Store the virtual PTD address for this CPU */
1592		IdlePTDS[x] = newptd;
1593
1594		/* clone currently active one (ie: IdlePTD) */
1595		bcopy(PTD, newptd, PAGE_SIZE);	/* inc prv page pde */
1596
1597		/* set up 0 -> 4MB P==V mapping for AP boot */
1598		newptd[0] = (pd_entry_t) (PG_V | PG_RW |
1599						((u_long)KPTphys & PG_FRAME));
1600
1601		/* store PTD for this AP's boot sequence */
1602		myPTD = (pd_entry_t *)vtophys(newptd);
1603
1604		/* alloc new page table page */
1605		newpt = (pt_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
1606
1607		/* set the new PTD's private page to point there */
1608		newptd[MPPTDI] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt));
1609
1610		/* install self referential entry */
1611		newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd));
1612
1613		/* allocate a new private data page */
1614		newpp = (int *)kmem_alloc(kernel_map, PAGE_SIZE);
1615
1616		/* wire it into the private page table page */
1617		newpt[0] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpp));
1618
1619		/* wire the ptp into itself for access */
1620		newpt[1] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt));
1621
1622		/* copy in the pointer to the local apic */
1623		newpt[2] = SMP_prvpt[2];
1624
1625		/* and the IO apic mapping[s] */
1626		for (i = 16; i < 32; i++)
1627			newpt[i] = SMP_prvpt[i];
1628
1629		/* allocate and set up an idle stack data page */
1630		stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE);
1631		for (i = 0; i < UPAGES; i++)
1632			newpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
1633
1634		newpt[3 + UPAGES] = 0;		/* *prv_CMAP1 */
1635		newpt[4 + UPAGES] = 0;		/* *prv_CMAP2 */
1636		newpt[5 + UPAGES] = 0;		/* *prv_CMAP3 */
1637
1638		/* prime data page for it to use */
1639		newpp[0] = x;			/* cpuid */
1640		newpp[1] = 0;			/* curproc */
1641		newpp[2] = 0;			/* curpcb */
1642		newpp[3] = 0;			/* npxproc */
1643		newpp[4] = 0;			/* runtime.tv_sec */
1644		newpp[5] = 0;			/* runtime.tv_usec */
1645		newpp[6] = x << 24;		/* cpu_lockid */
1646		newpp[7] = 0;			/* other_cpus */
1647		newpp[8] = (int)myPTD;		/* my_idlePTD */
1648		newpp[9] = 0;			/* ss_tpr */
1649		newpp[10] = (int)&newpt[3 + UPAGES];	/* prv_CMAP1 */
1650		newpp[11] = (int)&newpt[4 + UPAGES];	/* prv_CMAP2 */
1651		newpp[12] = (int)&newpt[5 + UPAGES];	/* prv_CMAP3 */
1652
1653		/* setup a vector to our boot code */
1654		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
1655		*((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4);
1656		outb(CMOS_REG, BIOS_RESET);
1657		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
1658
1659		bootPTD = myPTD;
1660		/* attempt to start the Application Processor */
1661		CHECK_INIT(99);	/* setup checkpoints */
1662		if (!start_ap(x, boot_addr)) {
1663			printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x));
1664			CHECK_PRINT("trace");	/* show checkpoints */
1665			/* better panic as the AP may be running loose */
1666			printf("panic y/n? [y] ");
1667			if (cngetc() != 'n')
1668				panic("bye-bye");
1669		}
1670		CHECK_PRINT("trace");		/* show checkpoints */
1671
1672		/* record its version info */
1673		cpu_apic_versions[x] = cpu_apic_versions[0];
1674
1675		all_cpus |= (1 << x);		/* record AP in CPU map */
1676	}
1677
1678	/* build our map of 'other' CPUs */
1679	other_cpus = all_cpus & ~(1 << cpuid);
1680
1681	/* fill in our (BSP) APIC version */
1682	cpu_apic_versions[0] = lapic.version;
1683
1684	/* restore the warmstart vector */
1685	*(u_long *) WARMBOOT_OFF = mpbioswarmvec;
1686	outb(CMOS_REG, BIOS_RESET);
1687	outb(CMOS_DATA, mpbiosreason);
1688
1689	/*
1690	 * Set up the idle context for the BSP.  Similar to above except
1691	 * that some was done by locore, some by pmap.c and some is implicit
1692	 * because the BSP is cpu#0 and the page is initially zero, and also
1693	 * because we can refer to variables by name on the BSP..
1694	 */
1695	newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE));
1696
1697	bcopy(PTD, newptd, PAGE_SIZE);	/* inc prv page pde */
1698	IdlePTDS[0] = newptd;
1699
1700	/* Point PTD[] to this page instead of IdlePTD's physical page */
1701	newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd));
1702
1703	my_idlePTD = (pd_entry_t *)vtophys(newptd);
1704
1705	/* Allocate and setup BSP idle stack */
1706	stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE);
1707	for (i = 0; i < UPAGES; i++)
1708		SMP_prvpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack));
1709
1710	pmap_set_opt_bsp();
1711
1712	for (i = 0; i < mp_ncpus; i++) {
1713		bcopy( (int *) PTD + KPTDI, (int *) IdlePTDS[i] + KPTDI, NKPDE * sizeof (int));
1714	}
1715
1716	/* number of APs actually started */
1717	return mp_ncpus - 1;
1718}
1719
1720
1721/*
1722 * load the 1st level AP boot code into base memory.
1723 */
1724
1725/* targets for relocation */
1726extern void bigJump(void);
1727extern void bootCodeSeg(void);
1728extern void bootDataSeg(void);
1729extern void MPentry(void);
1730extern u_int MP_GDT;
1731extern u_int mp_gdtbase;
1732
1733static void
1734install_ap_tramp(u_int boot_addr)
1735{
1736	int     x;
1737	int     size = *(int *) ((u_long) & bootMP_size);
1738	u_char *src = (u_char *) ((u_long) bootMP);
1739	u_char *dst = (u_char *) boot_addr + KERNBASE;
1740	u_int   boot_base = (u_int) bootMP;
1741	u_int8_t *dst8;
1742	u_int16_t *dst16;
1743	u_int32_t *dst32;
1744
1745	POSTCODE(INSTALL_AP_TRAMP_POST);
1746
1747	for (x = 0; x < size; ++x)
1748		*dst++ = *src++;
1749
1750	/*
1751	 * modify addresses in code we just moved to basemem. unfortunately we
1752	 * need fairly detailed info about mpboot.s for this to work.  changes
1753	 * to mpboot.s might require changes here.
1754	 */
1755
1756	/* boot code is located in KERNEL space */
1757	dst = (u_char *) boot_addr + KERNBASE;
1758
1759	/* modify the lgdt arg */
1760	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
1761	*dst32 = boot_addr + ((u_int) & MP_GDT - boot_base);
1762
1763	/* modify the ljmp target for MPentry() */
1764	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
1765	*dst32 = ((u_int) MPentry - KERNBASE);
1766
1767	/* modify the target for boot code segment */
1768	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
1769	dst8 = (u_int8_t *) (dst16 + 1);
1770	*dst16 = (u_int) boot_addr & 0xffff;
1771	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
1772
1773	/* modify the target for boot data segment */
1774	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
1775	dst8 = (u_int8_t *) (dst16 + 1);
1776	*dst16 = (u_int) boot_addr & 0xffff;
1777	*dst8 = ((u_int) boot_addr >> 16) & 0xff;
1778}
1779
1780
1781/*
1782 * this function starts the AP (application processor) identified
1783 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
1784 * to accomplish this.  This is necessary because of the nuances
1785 * of the different hardware we might encounter.  It ain't pretty,
1786 * but it seems to work.
1787 */
1788static int
1789start_ap(int logical_cpu, u_int boot_addr)
1790{
1791	int     physical_cpu;
1792	int     vector;
1793	int     cpus;
1794	u_long  icr_lo, icr_hi;
1795
1796	POSTCODE(START_AP_POST);
1797
1798	/* get the PHYSICAL APIC ID# */
1799	physical_cpu = CPU_TO_ID(logical_cpu);
1800
1801	/* calculate the vector */
1802	vector = (boot_addr >> 12) & 0xff;
1803
1804	/* used as a watchpoint to signal AP startup */
1805	cpus = mp_ncpus;
1806
1807	/*
1808	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
1809	 * and running the target CPU. OR this INIT IPI might be latched (P5
1810	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
1811	 * ignored.
1812	 */
1813
1814	/* setup the address for the target AP */
1815	icr_hi = lapic.icr_hi & ~APIC_ID_MASK;
1816	icr_hi |= (physical_cpu << 24);
1817	lapic.icr_hi = icr_hi;
1818
1819	/* do an INIT IPI: assert RESET */
1820	icr_lo = lapic.icr_lo & 0xfff00000;
1821	lapic.icr_lo = icr_lo | 0x0000c500;
1822
1823	/* wait for pending status end */
1824	while (lapic.icr_lo & APIC_DELSTAT_MASK)
1825		 /* spin */ ;
1826
1827	/* do an INIT IPI: deassert RESET */
1828	lapic.icr_lo = icr_lo | 0x00008500;
1829
1830	/* wait for pending status end */
1831	u_sleep(10000);		/* wait ~10mS */
1832	while (lapic.icr_lo & APIC_DELSTAT_MASK)
1833		 /* spin */ ;
1834
1835	/*
1836	 * next we do a STARTUP IPI: the previous INIT IPI might still be
1837	 * latched, (P5 bug) this 1st STARTUP would then terminate
1838	 * immediately, and the previously started INIT IPI would continue. OR
1839	 * the previous INIT IPI has already run. and this STARTUP IPI will
1840	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
1841	 * will run.
1842	 */
1843
1844	/* do a STARTUP IPI */
1845	lapic.icr_lo = icr_lo | 0x00000600 | vector;
1846	while (lapic.icr_lo & APIC_DELSTAT_MASK)
1847		 /* spin */ ;
1848	u_sleep(200);		/* wait ~200uS */
1849
1850	/*
1851	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1852	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
1853	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1854	 * recognized after hardware RESET or INIT IPI.
1855	 */
1856
1857	lapic.icr_lo = icr_lo | 0x00000600 | vector;
1858	while (lapic.icr_lo & APIC_DELSTAT_MASK)
1859		 /* spin */ ;
1860	u_sleep(200);		/* wait ~200uS */
1861
1862	/* wait for it to start */
1863	set_apic_timer(5000000);/* == 5 seconds */
1864	while (read_apic_timer())
1865		if (mp_ncpus > cpus)
1866			return 1;	/* return SUCCESS */
1867
1868	return 0;		/* return FAILURE */
1869}
1870
1871
/*
 * Ask all other CPUs to flush their TLB.
 *
 * Sends the XINVLTLB IPI to every CPU but the sender, and only once both
 * smp_started and invltlb_ok are set.  Without APIC_IO this does nothing.
 *
 * XXX: Needs to handshake and wait for completion before proceeding.
 */
void
smp_invltlb(void)
{
#if defined(APIC_IO)
	if (!smp_started || !invltlb_ok)
		return;

	all_but_self_ipi(XINVLTLB_OFFSET);
#endif  /* APIC_IO */
}
1885
1886void
1887invlpg(u_int addr)
1888{
1889	__asm   __volatile("invlpg (%0)"::"r"(addr):"memory");
1890
1891	/* send a message to the other CPUs */
1892	smp_invltlb();
1893}
1894
1895void
1896invltlb(void)
1897{
1898	u_long  temp;
1899
1900	/*
1901	 * This should be implemented as load_cr3(rcr3()) when load_cr3() is
1902	 * inlined.
1903	 */
1904	__asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory");
1905
1906	/* send a message to the other CPUs */
1907	smp_invltlb();
1908}
1909
1910
1911/*
1912 * When called the executing CPU will send an IPI to all other CPUs
1913 *  requesting that they halt execution.
1914 *
1915 * Usually (but not necessarily) called with 'other_cpus' as its arg.
1916 *
1917 *  - Signals all CPUs in map to stop.
1918 *  - Waits for each to stop.
1919 *
1920 * Returns:
1921 *  -1: error
1922 *   0: NA
1923 *   1: ok
1924 *
1925 * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs
1926 *            from executing at same time.
1927 */
1928int
1929stop_cpus(u_int map)
1930{
1931	if (!smp_started)
1932		return 0;
1933
1934	/* send IPI to all CPUs in map */
1935	stopped_cpus = 0;
1936
1937	/* send the Xcpustop IPI to all CPUs in map */
1938	selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED);
1939
1940	while (stopped_cpus != map)
1941		/* spin */ ;
1942
1943	return 1;
1944}
1945
1946
1947/*
1948 * Called by a CPU to restart stopped CPUs.
1949 *
1950 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
1951 *
1952 *  - Signals all CPUs in map to restart.
1953 *  - Waits for each to restart.
1954 *
1955 * Returns:
1956 *  -1: error
1957 *   0: NA
1958 *   1: ok
1959 */
1960int
1961restart_cpus(u_int map)
1962{
1963	if (!smp_started)
1964		return 0;
1965
1966	started_cpus = map;		/* signal other cpus to restart */
1967
1968	while (started_cpus)		/* wait for each to clear its bit */
1969		/* spin */ ;
1970
1971	return 1;
1972}
1973
1974int smp_active = 0;	/* are the APs allowed to run? */
1975SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, "");
1976
1977/* XXX maybe should be hw.ncpu */
1978int smp_cpus = 1;	/* how many cpu's running */
1979SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, "");
1980
1981int invltlb_ok = 0;	/* throttle smp_invltlb() till safe */
1982SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, "");
1983
1984int do_page_zero_idle = 0; /* bzero pages for fun and profit in idleloop */
1985SYSCTL_INT(_machdep, OID_AUTO, do_page_zero_idle, CTLFLAG_RW,
1986	   &do_page_zero_idle, 0, "");
1987
1988
1989/*
1990 * This is called once the rest of the system is up and running and we're
1991 * ready to let the AP's out of the pen.
1992 */
1993void ap_init(void);
1994
1995void
1996ap_init()
1997{
1998	u_int   temp;
1999	u_int	apic_id;
2000
2001	smp_cpus++;
2002
2003	/* Build our map of 'other' CPUs. */
2004	other_cpus = all_cpus & ~(1 << cpuid);
2005
2006	printf("SMP: AP CPU #%d Launched!\n", cpuid);
2007
2008	/* XXX FIXME: i386 specific, and redundant: Setup the FPU. */
2009	load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS);
2010
2011	/* A quick check from sanity claus */
2012	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
2013	if (cpuid != apic_id) {
2014		printf("SMP: cpuid = %d\n", cpuid);
2015		printf("SMP: apic_id = %d\n", apic_id);
2016		printf("PTD[MPPTDI] = %08x\n", PTD[MPPTDI]);
2017		panic("cpuid mismatch! boom!!");
2018	}
2019
2020	/* Init local apic for irq's */
2021	apic_initialize();
2022
2023	/*
2024	 * Activate smp_invltlb, although strictly speaking, this isn't
2025	 * quite correct yet.  We should have a bitfield for cpus willing
2026	 * to accept TLB flush IPI's or something and sync them.
2027	 */
2028	invltlb_ok = 1;
2029	smp_started = 1;	/* enable IPI's, tlb shootdown, freezes etc */
2030	smp_active = 1;		/* historic */
2031
2032	curproc = NULL;		/* make sure */
2033}
2034
2035void
2036getmtrr()
2037{
2038	int i;
2039
2040	if (cpu_class == CPUCLASS_686) {
2041		for(i = 0; i < NPPROVMTRR; i++) {
2042			PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2);
2043			PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2);
2044		}
2045	}
2046}
2047
2048void
2049putmtrr()
2050{
2051	int i;
2052
2053	if (cpu_class == CPUCLASS_686) {
2054		wbinvd();
2055		for(i = 0; i < NPPROVMTRR; i++) {
2056			wrmsr(PPRO_VMTRRphysBase0 + i * 2, PPro_vmtrr[i].base);
2057			wrmsr(PPRO_VMTRRphysMask0 + i * 2, PPro_vmtrr[i].mask);
2058		}
2059	}
2060}
2061
2062void
2063putfmtrr()
2064{
2065	if (cpu_class == CPUCLASS_686) {
2066		wbinvd();
2067		/*
2068		 * Set memory between 0-640K to be WB
2069		 */
2070		wrmsr(0x250, 0x0606060606060606LL);
2071		wrmsr(0x258, 0x0606060606060606LL);
2072		/*
2073		 * Set normal, PC video memory to be WC
2074		 */
2075		wrmsr(0x259, 0x0101010101010101LL);
2076	}
2077}
2078
2079
2080#ifdef BETTER_CLOCK
2081
2082#define CHECKSTATE_USER	0
2083#define CHECKSTATE_SYS	1
2084#define CHECKSTATE_INTR	2
2085
2086struct proc*	checkstate_curproc[NCPU];
2087int		checkstate_cpustate[NCPU];
2088u_long		checkstate_pc[NCPU];
2089
2090extern long	cp_time[CPUSTATES];
2091
2092#define PC_TO_INDEX(pc, prof)				\
2093        ((int)(((u_quad_t)((pc) - (prof)->pr_off) *	\
2094            (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
2095
2096static void
2097addupc_intr_forwarded(struct proc *p, int id, int *astmap)
2098{
2099	int i;
2100	struct uprof *prof;
2101	u_long pc;
2102
2103	pc = checkstate_pc[id];
2104	prof = &p->p_stats->p_prof;
2105	if (pc >= prof->pr_off &&
2106	    (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) {
2107		if ((p->p_flag & P_OWEUPC) == 0) {
2108			prof->pr_addr = pc;
2109			prof->pr_ticks = 1;
2110			p->p_flag |= P_OWEUPC;
2111		}
2112		*astmap |= (1 << id);
2113	}
2114}
2115
2116static void
2117forwarded_statclock(int id, int pscnt, int *astmap)
2118{
2119	struct pstats *pstats;
2120	long rss;
2121	struct rusage *ru;
2122	struct vmspace *vm;
2123	int cpustate;
2124	struct proc *p;
2125#ifdef GPROF
2126	register struct gmonparam *g;
2127	int i;
2128#endif
2129
2130	p = checkstate_curproc[id];
2131	cpustate = checkstate_cpustate[id];
2132
2133	switch (cpustate) {
2134	case CHECKSTATE_USER:
2135		if (p->p_flag & P_PROFIL)
2136			addupc_intr_forwarded(p, id, astmap);
2137		if (pscnt > 1)
2138			return;
2139		p->p_uticks++;
2140		if (p->p_nice > NZERO)
2141			cp_time[CP_NICE]++;
2142		else
2143			cp_time[CP_USER]++;
2144		break;
2145	case CHECKSTATE_SYS:
2146#ifdef GPROF
2147		/*
2148		 * Kernel statistics are just like addupc_intr, only easier.
2149		 */
2150		g = &_gmonparam;
2151		if (g->state == GMON_PROF_ON) {
2152			i = checkstate_pc[id] - g->lowpc;
2153			if (i < g->textsize) {
2154				i /= HISTFRACTION * sizeof(*g->kcount);
2155				g->kcount[i]++;
2156			}
2157		}
2158#endif
2159		if (pscnt > 1)
2160			return;
2161
2162		if (!p)
2163			cp_time[CP_IDLE]++;
2164		else {
2165			p->p_sticks++;
2166			cp_time[CP_SYS]++;
2167		}
2168		break;
2169	case CHECKSTATE_INTR:
2170	default:
2171#ifdef GPROF
2172		/*
2173		 * Kernel statistics are just like addupc_intr, only easier.
2174		 */
2175		g = &_gmonparam;
2176		if (g->state == GMON_PROF_ON) {
2177			i = checkstate_pc[id] - g->lowpc;
2178			if (i < g->textsize) {
2179				i /= HISTFRACTION * sizeof(*g->kcount);
2180				g->kcount[i]++;
2181			}
2182		}
2183#endif
2184		if (pscnt > 1)
2185			return;
2186		if (p)
2187			p->p_iticks++;
2188		cp_time[CP_INTR]++;
2189	}
2190	if (p != NULL) {
2191		p->p_cpticks++;
2192		if (++p->p_estcpu == 0)
2193			p->p_estcpu--;
2194		if ((p->p_estcpu & 3) == 0) {
2195			resetpriority(p);
2196			if (p->p_priority >= PUSER)
2197				p->p_priority = p->p_usrpri;
2198		}
2199
2200		/* Update resource usage integrals and maximums. */
2201		if ((pstats = p->p_stats) != NULL &&
2202		    (ru = &pstats->p_ru) != NULL &&
2203		    (vm = p->p_vmspace) != NULL) {
2204			ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
2205			ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
2206			ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
2207			rss = vm->vm_pmap.pm_stats.resident_count *
2208				PAGE_SIZE / 1024;
2209			if (ru->ru_maxrss < rss)
2210				ru->ru_maxrss = rss;
2211        	}
2212	}
2213}
2214
2215void
2216forward_statclock(int pscnt)
2217{
2218	int map;
2219	int id;
2220	int i;
2221
2222	/* Kludge. We don't yet have separate locks for the interrupts
2223	 * and the kernel. This means that we cannot let the other processors
2224	 * handle complex interrupts while inhibiting them from entering
2225	 * the kernel in a non-interrupt context.
2226	 *
2227	 * What we can do, without changing the locking mechanisms yet,
2228	 * is letting the other processors handle a very simple interrupt
2229	 * (wich determines the processor states), and do the main
2230	 * work ourself.
2231	 */
2232
2233	if (!smp_started || !invltlb_ok)
2234		return;
2235
2236	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle ) */
2237
2238	map = other_cpus;
2239	checkstate_probed_cpus = 0;
2240	selected_apic_ipi(map, XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2241
2242	i = 0;
2243	while (checkstate_probed_cpus != map) {
2244		/* spin */
2245		i++;
2246		if (i == 1000000) {
2247			printf("forward_statclock: checkstate %x\n",
2248			       checkstate_probed_cpus);
2249		}
2250	}
2251
2252	/*
2253	 * Step 2: walk through other processors processes, update ticks and
2254	 * profiling info.
2255	 */
2256
2257	map = 0;
2258	for (id = 0; id < mp_ncpus; id++) {
2259		if (id == cpuid)
2260			continue;
2261		if (((1 << id) & checkstate_probed_cpus) == 0)
2262			panic("state for cpu %d not available", cpuid);
2263		forwarded_statclock(id, pscnt, &map);
2264	}
2265	if (map != 0) {
2266		checkstate_need_ast |= map;
2267		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2268		i = 0;
2269		while (checkstate_need_ast != 0) {
2270			/* spin */
2271			i++;
2272			if (i > 1000000) {
2273				printf("forward_statclock: dropped ast 0x%x\n",
2274				       checkstate_need_ast);
2275				break;
2276			}
2277		}
2278	}
2279}
2280
2281void
2282forward_hardclock(int pscnt)
2283{
2284	int map;
2285	int id;
2286	struct proc *p;
2287	struct pstats *pstats;
2288	int i;
2289
2290	/* Kludge. We don't yet have separate locks for the interrupts
2291	 * and the kernel. This means that we cannot let the other processors
2292	 * handle complex interrupts while inhibiting them from entering
2293	 * the kernel in a non-interrupt context.
2294	 *
2295	 * What we can do, without changing the locking mechanisms yet,
2296	 * is letting the other processors handle a very simple interrupt
2297	 * (wich determines the processor states), and do the main
2298	 * work ourself.
2299	 */
2300
2301	if (!smp_started || !invltlb_ok)
2302		return;
2303
2304	/* Step 1: Probe state   (user, cpu, interrupt, spinlock, idle) */
2305
2306	map = other_cpus;
2307	checkstate_probed_cpus = 0;
2308	selected_apic_ipi(map, XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED);
2309
2310	i = 0;
2311	while (checkstate_probed_cpus != map) {
2312		/* spin */
2313		i++;
2314		if (i == 1000000) {
2315			printf("forward_hardclock: checkstate %x\n",
2316			       checkstate_probed_cpus);
2317		}
2318	}
2319
2320	/*
2321	 * Step 2: walk through other processors processes, update virtual
2322	 * timer and profiling timer. If stathz == 0, also update ticks and
2323	 * profiling info.
2324	 */
2325
2326	map = 0;
2327	for (id = 0; id < mp_ncpus; id++) {
2328		if (id == cpuid)
2329			continue;
2330		if (((1 << id) & checkstate_probed_cpus) == 0)
2331			panic("state for cpu %d not available", cpuid);
2332		p = checkstate_curproc[id];
2333		if (p) {
2334			pstats = p->p_stats;
2335			if (checkstate_cpustate[id] == CHECKSTATE_USER &&
2336			    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
2337			    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
2338				psignal(p, SIGVTALRM);
2339				map |= (1 << id);
2340			}
2341			if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
2342			    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
2343				psignal(p, SIGPROF);
2344				map |= (1 << id);
2345			}
2346		}
2347		if (stathz == 0) {
2348			forwarded_statclock( id, pscnt, &map);
2349		}
2350	}
2351	if (map != 0) {
2352		checkstate_need_ast |= map;
2353		selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED);
2354		i = 0;
2355		while (checkstate_need_ast != 0) {
2356			/* spin */
2357			i++;
2358			if (i > 1000000) {
2359				printf("forward_hardclock: dropped ast 0x%x\n",
2360				       checkstate_need_ast);
2361				break;
2362			}
2363		}
2364	}
2365}
2366
2367#endif /* BETTER_CLOCK */
2368