/*
 *	x86 SMP booting functions
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
 *
 *	Much of the core SMP work is based on previous work by Thomas Radke, to
 *	whom a great many thanks are extended.
 *
 *	Thanks to Intel for making available several different Pentium,
 *	Pentium Pro and Pentium-II/Xeon MP machines.
 *	Original development of Linux SMP code supported by Caldera.
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 *
 *	Fixes
 *		Felix Koop	:	NR_CPUS used properly
 *		Jose Renau	:	Handle single CPU case.
 *		Alan Cox	:	By repeated request 8) - Total BogoMIP report.
 *		Greg Wright	:	Fix for kernel stacks panic.
 *		Erich Boleyn	:	MP v1.4 and additional changes.
 *	Matthias Sattler	:	Changes for 2.1 kernel map.
 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
 *	Michael Chastain	:	Change trampoline.S to gnu as.
 *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
 *		Ingo Molnar	:	Added APIC timers, based on code
 *					from Jose Renau
 *		Ingo Molnar	:	various cleanups and rewrites
 *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
 *		Martin J. Bligh	:	Added support for multi-quad systems
 */

#include <linux/config.h>
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/smp_lock.h>
#include <linux/irq.h>
#include <linux/bootmem.h>

#include <linux/delay.h>
#include <linux/mc146818rtc.h>
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/smpboot.h>

/* Set if we find a B stepping CPU */
static int smp_b_stepping;

/* Setup configured maximum number of CPUs to activate */
static int max_cpus = -1;

/* Total count of live CPUs */
int smp_num_cpus = 1;

/* Number of siblings per CPU package */
int smp_num_siblings = 1;
int __initdata phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */

/* Bitmask of currently online CPUs */
unsigned long cpu_online_map;

static volatile unsigned long cpu_callin_map;
static volatile unsigned long cpu_callout_map;
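
/*
 * Boot handshake, summarized from the code below: the BP sets an AP's
 * bit in cpu_callout_map (in do_boot_cpu()) to let that AP continue its
 * bringup, and the AP sets its own bit in cpu_callin_map (in
 * smp_callin()) once it has calibrated and saved its CPU data, telling
 * the BP that it is alive.
 */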

/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;

/* Set when the idlers are all forked */
int smp_threads_ready;

/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

static int __init nosmp(char *str)
{
	max_cpus = 0;
	return 1;
}

__setup("nosmp", nosmp);

static int __init maxcpus(char *str)
{
	get_option(&str, &max_cpus);
	return 1;
}

__setup("maxcpus=", maxcpus);
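
/*
 * Example: booting with "maxcpus=2" activates at most two CPUs, while
 * "nosmp" or "maxcpus=0" boots uniprocessor (the MPS table is still
 * probed either way).
 */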

/*
 * Trampoline 80x86 program as an array.
 */

extern unsigned char trampoline_data [];
extern unsigned char trampoline_end  [];
static unsigned char *trampoline_base;

/*
 * Currently trivial. Write the real->protected mode
 * bootstrap into the page concerned. The caller
 * has made sure it's suitably aligned.
 */

static unsigned long __init setup_trampoline(void)
{
	memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
	return virt_to_phys(trampoline_base);
}

/*
 * We are called very early to get the low memory for the
 * SMP bootup trampoline page.
 */
void __init smp_alloc_memory(void)
{
	trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
	/*
	 * Has to be in very low memory so we can execute
	 * real-mode AP code.
	 */
	if (__pa(trampoline_base) >= 0x9F000)
		BUG();
}
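
/*
 * The check above keeps the whole trampoline page below 0xA0000 (640K),
 * the top of real-mode addressable conventional memory: a page-aligned
 * base under 0x9F000 leaves room for one full page, and the real-mode
 * AP startup code must live at such a low physical address.
 */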

/*
 * The bootstrap kernel entry code has set these up. Save them for
 * a given CPU.
 */

void __init smp_store_cpu_info(int id)
{
	struct cpuinfo_x86 *c = cpu_data + id;

	*c = boot_cpu_data;
	c->pte_quick = 0;
	c->pmd_quick = 0;
	c->pgd_quick = 0;
	c->pgtable_cache_sz = 0;
	identify_cpu(c);
	/*
	 * Stepping mask B: Pentium, but not Pentium MMX
	 */
	if (c->x86_vendor == X86_VENDOR_INTEL &&
	    c->x86 == 5 &&
	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
	    c->x86_model <= 3)
		/*
		 * Remember we have B step Pentia with bugs
		 */
		smp_b_stepping = 1;
}

/*
 * Architecture specific routine called by the kernel just before init is
 * fired off. This allows the BP to have everything in order [we hope].
 * At the end of this all the APs will hit the system scheduling and off
 * we go. Each AP will load the system gdt's and jump through the kernel
 * init into idle(). At this point the scheduler will one day take over
 * and give them jobs to do. smp_callin is a standard routine
 * we use to track CPUs as they power up.
 */

static atomic_t smp_commenced = ATOMIC_INIT(0);

void __init smp_commence(void)
{
	/*
	 * Lets the callins below out of their loop.
	 */
	Dprintk("Setting commenced=1, go go go\n");

	wmb();
	atomic_set(&smp_commenced, 1);
}

/*
 * TSC synchronization.
 *
 * We first check whether all CPUs have their TSCs synchronized,
 * then we print a warning if not, and always resync.
 */

static atomic_t tsc_start_flag = ATOMIC_INIT(0);
static atomic_t tsc_count_start = ATOMIC_INIT(0);
static atomic_t tsc_count_stop = ATOMIC_INIT(0);
static unsigned long long tsc_values[NR_CPUS];
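
/*
 * The counters above implement a two-phase barrier per loop iteration
 * below: tsc_count_start gates the "everybody read (and, on the last
 * pass, zero) the TSC" step, and tsc_count_stop gates leaving the
 * synchronization point.  The BP waits for smp_num_cpus-1 APs on each
 * counter and resets the other one, so consecutive iterations cannot
 * overtake each other.
 */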

#define NR_LOOPS 5

extern unsigned long fast_gettimeoffset_quotient;

/*
 * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
 * multiplication. Not terribly optimized but we need it at boot time only
 * anyway.
 *
 * result == a / b
 *	== (a1 + a2*(2^32)) / b
 *	== a1/b + a2*(2^32/b)
 *	== a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
 *		    ^---- (this multiplication can overflow)
 */

static unsigned long long __init div64 (unsigned long long a, unsigned long b0)
{
	unsigned int a1, a2;
	unsigned long long res;

	a1 = ((unsigned int*)&a)[0];
	a2 = ((unsigned int*)&a)[1];

	res = a1/b0 +
		(unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
		a2 / b0 +
		(a2 * (0xffffffff % b0)) / b0;

	return res;
}
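
/*
 * Worked example for the expansion above: a = 10000000000, b0 = 3,
 * so a1 = 1410065408 and a2 = 2.  Then
 *
 *	a1/b0			= 470021802
 *	a2*(0xffffffff/b0)	= 2*1431655765 = 2863311530
 *	a2/b0			= 0
 *	(a2*(0xffffffff%b0))/b0	= 0
 *
 * summing to 3333333332, one below the exact quotient 3333333333: the
 * dropped fractional parts make the result approximate, which is fine
 * for the boot-time averaging done below.
 */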

static void __init synchronize_tsc_bp (void)
{
	int i;
	unsigned long long t0;
	unsigned long long sum, avg;
	long long delta;
	unsigned long one_usec;
	int buggy = 0;

	printk("checking TSC synchronization across CPUs: ");

	one_usec = ((1<<30)/fast_gettimeoffset_quotient)*(1<<2);

	atomic_set(&tsc_start_flag, 1);
	wmb();

	/*
	 * We loop a few times to get a primed instruction cache,
	 * then the last pass is more or less synchronized and
	 * the BP and APs set their cycle counters to zero all at
	 * once. This reduces the chance of having random offsets
	 * between the processors, and guarantees that the maximum
	 * delay between the cycle counters is never bigger than
	 * the latency of information-passing (cachelines) between
	 * two CPUs.
	 */
	for (i = 0; i < NR_LOOPS; i++) {
		/*
		 * all APs synchronize but they loop on '== num_cpus'
		 */
		while (atomic_read(&tsc_count_start) != smp_num_cpus-1) mb();
		atomic_set(&tsc_count_stop, 0);
		wmb();
		/*
		 * this lets the APs save their current TSC:
		 */
		atomic_inc(&tsc_count_start);

		rdtscll(tsc_values[smp_processor_id()]);
		/*
		 * We clear the TSC in the last loop:
		 */
		if (i == NR_LOOPS-1)
			write_tsc(0, 0);

		/*
		 * Wait for all APs to leave the synchronization point:
		 */
		while (atomic_read(&tsc_count_stop) != smp_num_cpus-1) mb();
		atomic_set(&tsc_count_start, 0);
		wmb();
		atomic_inc(&tsc_count_stop);
	}

	sum = 0;
	for (i = 0; i < smp_num_cpus; i++) {
		t0 = tsc_values[i];
		sum += t0;
	}
	avg = div64(sum, smp_num_cpus);

	sum = 0;
	for (i = 0; i < smp_num_cpus; i++) {
		delta = tsc_values[i] - avg;
		if (delta < 0)
			delta = -delta;
		/*
		 * We report clock differences bigger than 2 microseconds.
		 */
		if (delta > 2*one_usec) {
			long realdelta;
			if (!buggy) {
				buggy = 1;
				printk("\n");
			}
			realdelta = div64(delta, one_usec);
			if (tsc_values[i] < avg)
				realdelta = -realdelta;

			printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
				i, realdelta);
		}

		sum += delta;
	}
	if (!buggy)
		printk("passed.\n");
}

static void __init synchronize_tsc_ap (void)
{
	int i;

	/*
	 * smp_num_cpus is not necessarily known at the time
	 * this gets called, so we first wait for the BP to
	 * finish SMP initialization:
	 */
	while (!atomic_read(&tsc_start_flag)) mb();

	for (i = 0; i < NR_LOOPS; i++) {
		atomic_inc(&tsc_count_start);
		while (atomic_read(&tsc_count_start) != smp_num_cpus) mb();

		rdtscll(tsc_values[smp_processor_id()]);
		if (i == NR_LOOPS-1)
			write_tsc(0, 0);

		atomic_inc(&tsc_count_stop);
		while (atomic_read(&tsc_count_stop) != smp_num_cpus) mb();
	}
}
#undef NR_LOOPS

extern void calibrate_delay(void);

static atomic_t init_deasserted;

void __init smp_callin(void)
{
	int cpuid, phys_id;
	unsigned long timeout;

	/*
	 * If woken up by an INIT in an 82489DX configuration
	 * we may get here before an INIT-deassert IPI reaches
	 * our local APIC.  We have to wait for the IPI or we'll
	 * lock up on an APIC access.
	 */
	if (!clustered_apic_mode)
		while (!atomic_read(&init_deasserted));

	/*
	 * (This works even if the APIC is not enabled.)
	 */
	phys_id = GET_APIC_ID(apic_read(APIC_ID));
	cpuid = current->processor;
	if (test_and_set_bit(cpuid, &cpu_online_map)) {
		printk("huh, phys CPU#%d, CPU#%d already present??\n",
					phys_id, cpuid);
		BUG();
	}
	Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);

	/*
	 * STARTUP IPIs are fragile beasts as they might sometimes
	 * trigger some glue motherboard logic. Complete APIC bus
	 * silence for 1 second; this overestimates the time the
	 * boot CPU spends sending the up to 2 STARTUP IPIs
	 * by a factor of two. This should be enough.
	 */

	/*
	 * Waiting 2s total for startup (udelay is not yet working)
	 */
	timeout = jiffies + 2*HZ;
	while (time_before(jiffies, timeout)) {
		/*
		 * Has the boot CPU finished its STARTUP sequence?
		 */
		if (test_bit(cpuid, &cpu_callout_map))
			break;
		rep_nop();
	}

	if (!time_before(jiffies, timeout)) {
		printk("BUG: CPU%d started up but did not get a callout!\n",
			cpuid);
		BUG();
	}

	/*
	 * The boot CPU has finished the init stage and is spinning
	 * on callin_map until we finish. We are free to set up this
	 * CPU, first the APIC. (this is probably redundant on most
	 * boards)
	 */

	Dprintk("CALLIN, before setup_local_APIC().\n");
	/*
	 * Because we use NMIs rather than the INIT-STARTUP sequence to
	 * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
	 */
	if (clustered_apic_mode)
		clear_local_APIC();
	setup_local_APIC();

	__sti();

#ifdef CONFIG_MTRR
	/*
	 * Must be done before the calibration delay is computed
	 */
	mtrr_init_secondary_cpu ();
#endif
	/*
	 * Get our bogomips.
	 */
	calibrate_delay();
	Dprintk("Stack at about %p\n", &cpuid);

	/*
	 * Save our processor parameters
	 */
	smp_store_cpu_info(cpuid);

	/*
	 * Allow the master to continue.
	 */
	set_bit(cpuid, &cpu_callin_map);

	/*
	 * Synchronize the TSC with the BP
	 */
	if (cpu_has_tsc)
		synchronize_tsc_ap();
}

int cpucount;

extern int cpu_idle(void);

/*
 * Activate a secondary processor.
 */
int __init start_secondary(void *unused)
{
	/*
	 * Don't put anything before smp_callin(); SMP booting
	 * is so fragile that we want to limit the things done
	 * here to the bare minimum.
	 */
	cpu_init();
	smp_callin();
	while (!atomic_read(&smp_commenced))
		rep_nop();
	/*
	 * low-memory mappings have been cleared, flush them from
	 * the local TLBs too.
	 */
	local_flush_tlb();

	return cpu_idle();
}

/*
 * Everything has been set up for the secondary
 * CPUs - they just need to reload everything
 * from the task structure.
 * This function must not return.
 */
void __init initialize_secondary(void)
{
	/*
	 * We don't actually need to load the full TSS,
	 * basically just the stack pointer and the eip.
	 */

	asm volatile(
		"movl %0,%%esp\n\t"
		"jmp *%1"
		:
		:"r" (current->thread.esp),"r" (current->thread.eip));
}

extern struct {
	void * esp;
	unsigned short ss;
} stack_start;
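
/*
 * stack_start feeds the low-level startup path once the trampoline has
 * switched the AP into protected mode: the esp/ss layout matches a
 * stack-segment load in the boot assembly that consumes it, and
 * do_boot_cpu() below aims it at the new idle task's stack before
 * waking the AP.
 */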

static int __init fork_by_hand(void)
{
	struct pt_regs regs;
	/*
	 * don't care about the eip and regs settings since
	 * we'll never reschedule the forked task.
	 */
	return do_fork(CLONE_VM|CLONE_PID, 0, &regs, 0);
}

/* which physical APIC ID maps to which logical CPU number */
volatile int physical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which physical APIC ID */
volatile int cpu_2_physical_apicid[NR_CPUS];

/* which logical APIC ID maps to which logical CPU number */
volatile int logical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which logical APIC ID */
volatile int cpu_2_logical_apicid[NR_CPUS];

static inline void init_cpu_to_apicid(void)
/* Initialize all maps between cpu number and apicids */
{
	int apicid, cpu;

	for (apicid = 0; apicid < MAX_APICID; apicid++) {
		physical_apicid_2_cpu[apicid] = -1;
		logical_apicid_2_cpu[apicid] = -1;
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		cpu_2_physical_apicid[cpu] = -1;
		cpu_2_logical_apicid[cpu] = -1;
	}
}

static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
/*
 * set up a mapping between cpu and apicid. Uses logical apicids for multiquad,
 * else physical apic ids
 */
{
	if (clustered_apic_mode) {
		logical_apicid_2_cpu[apicid] = cpu;
		cpu_2_logical_apicid[cpu] = apicid;
	} else {
		physical_apicid_2_cpu[apicid] = cpu;
		cpu_2_physical_apicid[cpu] = apicid;
	}
}

static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
/*
 * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
 * else physical apic ids
 */
{
	if (clustered_apic_mode) {
		logical_apicid_2_cpu[apicid] = -1;
		cpu_2_logical_apicid[cpu] = -1;
	} else {
		physical_apicid_2_cpu[apicid] = -1;
		cpu_2_physical_apicid[cpu] = -1;
	}
}

#if APIC_DEBUG
static inline void inquire_remote_apic(int apicid)
{
	int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
	char *names[] = { "ID", "VERSION", "SPIV" };
	int timeout, status;

	printk("Inquiring remote APIC #%d...\n", apicid);

	for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
		printk("... APIC #%d %s: ", apicid, names[i]);

		/*
		 * Wait for idle.
		 */
		apic_wait_icr_idle();

		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);

		timeout = 0;
		do {
			udelay(100);
			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);

		switch (status) {
		case APIC_ICR_RR_VALID:
			status = apic_read(APIC_RRR);
			printk("%08x\n", status);
			break;
		default:
			printk("failed\n");
		}
	}
}
#endif

static int wakeup_secondary_via_NMI(int logical_apicid)
/*
 * Poke the other CPU in the eye to wake it up. Remember that the normal
 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
 * won't ... remember to clear down the APIC, etc. later.
 */
{
	unsigned long send_status = 0, accept_status = 0;
	int timeout, maxlvt;

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));

	/* Boot on the stack */
	/* Kick the second */
	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);

	Dprintk("Waiting for send to finish...\n");
	timeout = 0;
	do {
		Dprintk("+");
		udelay(100);
		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
	} while (send_status && (timeout++ < 1000));

	/*
	 * Give the other CPU some time to accept the IPI.
	 */
	udelay(200);
	/*
	 * Due to the Pentium erratum 3AP.
	 */
	maxlvt = get_maxlvt();
	if (maxlvt > 3) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
	}
	accept_status = (apic_read(APIC_ESR) & 0xEF);
	Dprintk("NMI sent.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}

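/*
 * Wake a secondary via the standard INIT-deassert-STARTUP dance: assert
 * the level-triggered INIT IPI, wait 10ms, deassert it, then (only on
 * integrated APICs, not external 82489DX parts) send up to two STARTUP
 * IPIs whose vector field carries the physical page number of the
 * trampoline (start_eip >> 12).
 */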
static int wakeup_secondary_via_INIT(int phys_apicid, unsigned long start_eip)
{
	unsigned long send_status = 0, accept_status = 0;
	int maxlvt, timeout, num_starts, j;

	Dprintk("Asserting INIT.\n");

	/*
	 * Turn INIT on target chip
	 */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/*
	 * Send IPI
	 */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
				| APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	timeout = 0;
	do {
		Dprintk("+");
		udelay(100);
		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
	} while (send_status && (timeout++ < 1000));

	mdelay(10);

	Dprintk("Deasserting INIT.\n");

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/* Send IPI */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	timeout = 0;
	do {
		Dprintk("+");
		udelay(100);
		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
	} while (send_status && (timeout++ < 1000));

	atomic_set(&init_deasserted, 1);

	/*
	 * Should we send STARTUP IPIs?
	 *
	 * Determine this based on the APIC version.
	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
	 */
	if (APIC_INTEGRATED(apic_version[phys_apicid]))
		num_starts = 2;
	else
		num_starts = 0;

	/*
	 * Run STARTUP IPI loop.
	 */
	Dprintk("#startup loops: %d.\n", num_starts);

	maxlvt = get_maxlvt();

	for (j = 1; j <= num_starts; j++) {
		Dprintk("Sending STARTUP #%d.\n", j);
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
		Dprintk("After apic_write.\n");

		/*
		 * STARTUP IPI
		 */

		/* Target chip */
		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

		/* Boot on the stack */
		/* Kick the second */
		apic_write_around(APIC_ICR, APIC_DM_STARTUP
					| (start_eip >> 12));

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(300);

		Dprintk("Startup point 1.\n");

		Dprintk("Waiting for send to finish...\n");
		timeout = 0;
		do {
			Dprintk("+");
			udelay(100);
			send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
		} while (send_status && (timeout++ < 1000));

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(200);
		/*
		 * Due to the Pentium erratum 3AP.
		 */
		if (maxlvt > 3) {
			apic_read_around(APIC_SPIV);
			apic_write(APIC_ESR, 0);
		}
		accept_status = (apic_read(APIC_ESR) & 0xEF);
		if (send_status || accept_status)
			break;
	}
	Dprintk("After Startup.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}

extern unsigned long cpu_initialized;

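/*
 * The warm-reset path used below: writing 0xA to CMOS register 0xF (the
 * shutdown status byte) makes the BIOS jump through the vector at 40:67
 * on reset instead of doing a cold start; TRAMPOLINE_HIGH/TRAMPOLINE_LOW
 * address the segment and offset words of that vector, which we point
 * at the trampoline page (and restore to defaults in smp_boot_cpus()).
 */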
static void __init do_boot_cpu (int apicid)
/*
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 */
{
	struct task_struct *idle;
	unsigned long boot_error = 0;
	int timeout, cpu;
	unsigned long start_eip;
	unsigned short nmi_high, nmi_low;

	cpu = ++cpucount;
	/*
	 * We can't use kernel_thread since we must avoid
	 * rescheduling the child.
	 */
	if (fork_by_hand() < 0)
		panic("failed fork for CPU %d", cpu);

	/*
	 * We remove it from the pidhash and the runqueue
	 * once we've got the process:
	 */
	idle = init_task.prev_task;
	if (!idle)
		panic("No idle process for CPU %d", cpu);

	idle->processor = cpu;
	idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */

	map_cpu_to_boot_apicid(cpu, apicid);

	idle->thread.eip = (unsigned long) start_secondary;

	del_from_runqueue(idle);
	unhash_process(idle);
	init_tasks[cpu] = idle;

	/* start_eip had better be page-aligned! */
	start_eip = setup_trampoline();

	/* So we see what's up */
	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
	stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);

	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
	 */

	atomic_set(&init_deasserted, 0);

	Dprintk("Setting warm reset code and vector.\n");

	if (clustered_apic_mode == CLUSTERED_APIC_NUMAQ) {
		/* stash the current NMI vector, so we can put things back */
		nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
		nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
	}

	CMOS_WRITE(0xa, 0xf);
	local_flush_tlb();
	Dprintk("1.\n");
	*((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
	Dprintk("2.\n");
	*((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
	Dprintk("3.\n");

	/*
	 * Be paranoid about clearing APIC errors.
	 */
	if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[apicid])) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	/*
	 * Status is now clean
	 */
	boot_error = 0;

	/*
	 * Starting actual IPI sequence...
	 */

	if (clustered_apic_mode == CLUSTERED_APIC_NUMAQ)
		boot_error = wakeup_secondary_via_NMI(apicid);
	else
		boot_error = wakeup_secondary_via_INIT(apicid, start_eip);

	if (!boot_error) {
		/*
		 * allow APs to start initializing.
		 */
		Dprintk("Before Callout %d.\n", cpu);
		set_bit(cpu, &cpu_callout_map);
		Dprintk("After Callout %d.\n", cpu);

		/*
		 * Wait 5s total for a response
		 */
		for (timeout = 0; timeout < 50000; timeout++) {
			if (test_bit(cpu, &cpu_callin_map))
				break;	/* It has booted */
			udelay(100);
		}

		if (test_bit(cpu, &cpu_callin_map)) {
			/* number CPUs logically, starting from 1 (BSP is 0) */
			Dprintk("OK.\n");
			printk("CPU%d: ", cpu);
			print_cpu_info(&cpu_data[cpu]);
			Dprintk("CPU has booted.\n");
		} else {
			boot_error = 1;
			if (*((volatile unsigned char *)phys_to_virt(8192))
					== 0xA5)
				/* trampoline started but...? */
				printk("Stuck ??\n");
			else
				/* trampoline code not run */
				printk("Not responding.\n");
#if APIC_DEBUG
			if (!clustered_apic_mode)
				inquire_remote_apic(apicid);
#endif
		}
	}
	if (boot_error) {
		/* Try to put things back the way they were before ... */
		unmap_cpu_to_boot_apicid(cpu, apicid);
		clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
		clear_bit(cpu, &cpu_online_map);  /* was set in smp_callin() */
		cpucount--;
	}

	/* mark "stuck" area as not stuck */
	*((volatile unsigned long *)phys_to_virt(8192)) = 0;

	if (clustered_apic_mode == CLUSTERED_APIC_NUMAQ) {
		printk("Restoring NMI vector\n");
		*((volatile unsigned short *) TRAMPOLINE_HIGH) = nmi_high;
		*((volatile unsigned short *) TRAMPOLINE_LOW) = nmi_low;
	}
}

cycles_t cacheflush_time;

static void smp_tune_scheduling (void)
{
	unsigned long cachesize;       /* kB   */
	unsigned long bandwidth = 350; /* MB/s */
	/*
	 * Rough estimation for SMP scheduling; this is the number of
	 * cycles it takes for a fully memory-limited process to flush
	 * the SMP-local cache.
	 *
	 * (For a P5 this pretty much means we will choose another idle
	 *  CPU almost always at wakeup time (this is due to the small
	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
	 *  the cache size)
	 */

	if (!cpu_khz) {
		/*
		 * this basically disables processor-affinity
		 * scheduling on SMP without a TSC.
		 */
		cacheflush_time = 0;
		return;
	} else {
		cachesize = boot_cpu_data.x86_cache_size;
		if (cachesize == -1) {
			cachesize = 16; /* Pentiums, 2x8kB cache */
			bandwidth = 100;
		}

		cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
	}

	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
		(long)cacheflush_time/(cpu_khz/1000),
		((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
}
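
/*
 * Example with the formula above: a 500 MHz CPU (cpu_khz = 500000) with
 * a 256 kB cache and the default 350 MB/s bandwidth estimate gives
 * (500000>>10) * (256<<10) / 350 = 488 * 262144 / 350 ~= 365503 cycles,
 * i.e. roughly 731 usecs at ~500 cycles per usec.
 */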

/*
 * Cycle through the processors sending APIC IPIs to boot each.
 */

static int boot_cpu_logical_apicid;
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;

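/*
 * For HyperThreading: cpu_sibling_map[cpu] holds the logical number of
 * the CPU sharing cpu's physical package (matched via phys_proc_id[] at
 * the end of smp_boot_cpus()), or NO_PROC_ID if no sibling was found.
 */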
int cpu_sibling_map[NR_CPUS] __cacheline_aligned;

void __init smp_boot_cpus(void)
{
	int apicid, cpu, bit;

	if ((clustered_apic_mode == CLUSTERED_APIC_NUMAQ) && (numnodes > 1)) {
		printk("Remapping cross-quad port I/O for %d quads\n",
			numnodes);
		printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
			(u_long) xquad_portio,
			(u_long) numnodes * XQUAD_PORTIO_LEN);
		xquad_portio = ioremap (XQUAD_PORTIO_BASE,
			numnodes * XQUAD_PORTIO_LEN);
	}

#ifdef CONFIG_MTRR
	/* Must be done before other processors are booted */
	mtrr_init_boot_cpu ();
#endif
	/*
	 * Initialize the logical to physical CPU number mapping
	 */

	init_cpu_to_apicid();

	/*
	 * Setup boot CPU information
	 */
	smp_store_cpu_info(0); /* Final full version of the data */
	printk("CPU%d: ", 0);
	print_cpu_info(&cpu_data[0]);

	/*
	 * We have the boot CPU online for sure.
	 */
	set_bit(0, &cpu_online_map);
	boot_cpu_logical_apicid = logical_smp_processor_id();
	map_cpu_to_boot_apicid(0, boot_cpu_apicid);

	global_irq_holder = 0;
	current->processor = 0;
	init_idle();
	smp_tune_scheduling();

	/*
	 * If we couldn't find an SMP configuration at boot time,
	 * get out of here now!
	 */
	if (!smp_found_config) {
		printk(KERN_NOTICE "SMP motherboard not detected.\n");
#ifndef CONFIG_VISWS
		io_apic_irqs = 0;
#endif
		cpu_online_map = phys_cpu_present_map = 1;
		smp_num_cpus = 1;
		if (APIC_init_uniprocessor())
			printk(KERN_NOTICE "Local APIC not detected."
					   " Using dummy APIC emulation.\n");
		goto smp_done;
	}

	/*
	 * Should not be necessary because the MP table should list the boot
	 * CPU too, but we do it for the sake of robustness anyway.
	 * Makes no sense to do this check in clustered apic mode, so skip it.
	 */
	if (!clustered_apic_mode &&
	    !test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map)) {
		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
							boot_cpu_physical_apicid);
		phys_cpu_present_map |= (1 << hard_smp_processor_id());
	}

	/*
	 * If we couldn't find a local APIC, then get out of here now!
	 */
	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
	    !test_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability)) {
		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
			boot_cpu_physical_apicid);
		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
#ifndef CONFIG_VISWS
		io_apic_irqs = 0;
#endif
		cpu_online_map = phys_cpu_present_map = 1;
		smp_num_cpus = 1;
		goto smp_done;
	}

	verify_local_APIC();

	/*
	 * If SMP should be disabled, then really disable it!
	 */
	if (!max_cpus) {
		smp_found_config = 0;
		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
#ifndef CONFIG_VISWS
		io_apic_irqs = 0;
#endif
		cpu_online_map = phys_cpu_present_map = 1;
		smp_num_cpus = 1;
		goto smp_done;
	}

	connect_bsp_APIC();
	setup_local_APIC();

	if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
		BUG();

	/*
	 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
	 *
	 * In clustered apic mode, phys_cpu_present_map is constructed thus:
	 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
	 * clustered apic ID.
	 */
	Dprintk("CPU present map: %lx\n", phys_cpu_present_map);

	for (bit = 0; bit < NR_CPUS; bit++) {
		apicid = cpu_present_to_apicid(bit);
		/*
		 * Don't even attempt to start the boot CPU!
		 */
		if (apicid == boot_cpu_apicid)
			continue;

		if (!(phys_cpu_present_map & (1 << bit)))
			continue;
		if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
			continue;

		do_boot_cpu(apicid);

		/*
		 * Make sure we unmap all failed CPUs
		 */
		if ((boot_apicid_to_cpu(apicid) == -1) &&
				(phys_cpu_present_map & (1 << bit)))
			printk("CPU #%d not responding - cannot use it.\n",
								apicid);
	}

	/*
	 * Cleanup possible dangling ends...
	 */
#ifndef CONFIG_VISWS
	{
		/*
		 * Install writable page 0 entry to set BIOS data area.
		 */
		local_flush_tlb();

		/*
		 * Paranoid:  Set warm reset code and vector here back
		 * to default values.
		 */
		CMOS_WRITE(0, 0xf);

		*((volatile long *) phys_to_virt(0x467)) = 0;
	}
#endif

	/*
	 * Allow the user to impress friends.
	 */

	Dprintk("Before bogomips.\n");
	if (!cpucount) {
		printk(KERN_ERR "Error: only one processor found.\n");
	} else {
		unsigned long bogosum = 0;
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (cpu_online_map & (1<<cpu))
				bogosum += cpu_data[cpu].loops_per_jiffy;
		printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
			cpucount+1,
			bogosum/(500000/HZ),
			(bogosum/(5000/HZ))%100);
		Dprintk("Before bogocount - setting activated=1.\n");
	}
	smp_num_cpus = cpucount + 1;

	if (smp_b_stepping)
		printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
	Dprintk("Boot done.\n");

	/*
	 * If Hyper-Threading is available, construct cpu_sibling_map[], so
	 * that we can look up a CPU's sibling efficiently.
	 */
	if (test_bit(X86_FEATURE_HT, boot_cpu_data.x86_capability)
	    && smp_num_siblings > 1) {
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			cpu_sibling_map[cpu] = NO_PROC_ID;

		for (cpu = 0; cpu < smp_num_cpus; cpu++) {
			int i;

			for (i = 0; i < smp_num_cpus; i++) {
				if (i == cpu)
					continue;
				if (phys_proc_id[cpu] == phys_proc_id[i]) {
					cpu_sibling_map[cpu] = i;
					printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]);
					break;
				}
			}
			if (cpu_sibling_map[cpu] == NO_PROC_ID) {
				smp_num_siblings = 1;
				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
			}
		}
	}

#ifndef CONFIG_VISWS
	/*
	 * Here we can be sure that there is an IO-APIC in the system. Let's
	 * go and set it up:
	 */
	if (!skip_ioapic_setup && nr_ioapics)
		setup_IO_APIC();
#endif

	/*
	 * Set up all local APIC timers in the system:
	 */
	setup_APIC_clocks();

#if defined(CONFIG_KERNPROF)
	/*
	 * Set up all local APIC performance counter overflow vectors,
	 * if available:
	 */
	if (cpu_has_msr && boot_cpu_data.x86 == 6)
		setup_APIC_perfctr();
#endif

	/*
	 * Synchronize the TSC with the APs
	 */
	if (cpu_has_tsc && cpucount)
		synchronize_tsc_bp();

smp_done:
	zap_low_mappings();
}