/*
 *	x86 SMP booting functions
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
 *
 *	Much of the core SMP work is based on previous work by Thomas Radke, to
 *	whom a great many thanks are extended.
 *
 *	Thanks to Intel for making available several different Pentium,
 *	Pentium Pro and Pentium-II/Xeon MP machines.
 *	Original development of Linux SMP code supported by Caldera.
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 *
 *	Fixes
 *		Felix Koop	:	NR_CPUS used properly
 *		Jose Renau	:	Handle single CPU case.
 *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
 *		Greg Wright	:	Fix for kernel stacks panic.
 *		Erich Boleyn	:	MP v1.4 and additional changes.
 *	Matthias Sattler	:	Changes for 2.1 kernel map.
 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
 *	Michael Chastain	:	Change trampoline.S to gnu as.
 *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
 *		Ingo Molnar	:	Added APIC timers, based on code
 *					from Jose Renau
 *		Ingo Molnar	:	various cleanups and rewrites
 *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
 *		Martin J. Bligh	:	Added support for multi-quad systems
 *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process.
*/ 35 36#include <linux/module.h> 37#include <linux/init.h> 38#include <linux/kernel.h> 39 40#include <linux/mm.h> 41#include <linux/sched.h> 42#include <linux/kernel_stat.h> 43#include <linux/bootmem.h> 44#include <linux/notifier.h> 45#include <linux/cpu.h> 46#include <linux/percpu.h> 47#include <linux/nmi.h> 48 49#include <linux/delay.h> 50#include <linux/mc146818rtc.h> 51#include <asm/tlbflush.h> 52#include <asm/desc.h> 53#include <asm/arch_hooks.h> 54#include <asm/nmi.h> 55 56#include <mach_apic.h> 57#include <mach_wakecpu.h> 58#include <smpboot_hooks.h> 59#include <asm/vmi.h> 60#include <asm/mtrr.h> 61 62/* Set if we find a B stepping CPU */ 63static int __devinitdata smp_b_stepping; 64 65/* Number of siblings per CPU package */ 66int smp_num_siblings = 1; 67EXPORT_SYMBOL(smp_num_siblings); 68 69/* Last level cache ID of each logical CPU */ 70int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; 71 72/* representing HT siblings of each logical CPU */ 73cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; 74EXPORT_SYMBOL(cpu_sibling_map); 75 76/* representing HT and core siblings of each logical CPU */ 77cpumask_t cpu_core_map[NR_CPUS] __read_mostly; 78EXPORT_SYMBOL(cpu_core_map); 79 80/* bitmap of online cpus */ 81cpumask_t cpu_online_map __read_mostly; 82EXPORT_SYMBOL(cpu_online_map); 83 84cpumask_t cpu_callin_map; 85cpumask_t cpu_callout_map; 86EXPORT_SYMBOL(cpu_callout_map); 87cpumask_t cpu_possible_map; 88EXPORT_SYMBOL(cpu_possible_map); 89static cpumask_t smp_commenced_mask; 90 91/* Per CPU bogomips and other parameters */ 92struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 93EXPORT_SYMBOL(cpu_data); 94 95u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = 96 { [0 ... NR_CPUS-1] = 0xff }; 97EXPORT_SYMBOL(x86_cpu_to_apicid); 98 99u8 apicid_2_node[MAX_APICID]; 100 101/* 102 * Trampoline 80x86 program as an array. 
103 */ 104 105extern unsigned char trampoline_data []; 106extern unsigned char trampoline_end []; 107static unsigned char *trampoline_base; 108static int trampoline_exec; 109 110static void map_cpu_to_logical_apicid(void); 111 112/* State of each CPU. */ 113DEFINE_PER_CPU(int, cpu_state) = { 0 }; 114 115/* 116 * Currently trivial. Write the real->protected mode 117 * bootstrap into the page concerned. The caller 118 * has made sure it's suitably aligned. 119 */ 120 121static unsigned long __devinit setup_trampoline(void) 122{ 123 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); 124 return virt_to_phys(trampoline_base); 125} 126 127/* 128 * We are called very early to get the low memory for the 129 * SMP bootup trampoline page. 130 */ 131void __init smp_alloc_memory(void) 132{ 133 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); 134 /* 135 * Has to be in very low memory so we can execute 136 * real-mode AP code. 137 */ 138 if (__pa(trampoline_base) >= 0x9F000) 139 BUG(); 140 /* 141 * Make the SMP trampoline executable: 142 */ 143 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); 144} 145 146/* 147 * The bootstrap kernel entry code has set these up. Save them for 148 * a given CPU 149 */ 150 151static void __cpuinit smp_store_cpu_info(int id) 152{ 153 struct cpuinfo_x86 *c = cpu_data + id; 154 155 *c = boot_cpu_data; 156 if (id!=0) 157 identify_secondary_cpu(c); 158 /* 159 * Mask B, Pentium, but not Pentium MMX 160 */ 161 if (c->x86_vendor == X86_VENDOR_INTEL && 162 c->x86 == 5 && 163 c->x86_mask >= 1 && c->x86_mask <= 4 && 164 c->x86_model <= 3) 165 /* 166 * Remember we have B step Pentia with bugs 167 */ 168 smp_b_stepping = 1; 169 170 /* 171 * Certain Athlons might work (for various values of 'work') in SMP 172 * but they are not certified as MP capable. 
173 */ 174 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { 175 176 if (num_possible_cpus() == 1) 177 goto valid_k7; 178 179 /* Athlon 660/661 is valid. */ 180 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) 181 goto valid_k7; 182 183 /* Duron 670 is valid */ 184 if ((c->x86_model==7) && (c->x86_mask==0)) 185 goto valid_k7; 186 187 /* 188 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit. 189 * It's worth noting that the A5 stepping (662) of some Athlon XP's 190 * have the MP bit set. 191 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more. 192 */ 193 if (((c->x86_model==6) && (c->x86_mask>=2)) || 194 ((c->x86_model==7) && (c->x86_mask>=1)) || 195 (c->x86_model> 7)) 196 if (cpu_has_mp) 197 goto valid_k7; 198 199 /* If we get here, it's not a certified SMP capable AMD system. */ 200 add_taint(TAINT_UNSAFE_SMP); 201 } 202 203valid_k7: 204 ; 205} 206 207extern void calibrate_delay(void); 208 209static atomic_t init_deasserted; 210 211static void __cpuinit smp_callin(void) 212{ 213 int cpuid, phys_id; 214 unsigned long timeout; 215 216 /* 217 * If waken up by an INIT in an 82489DX configuration 218 * we may get here before an INIT-deassert IPI reaches 219 * our local APIC. We have to wait for the IPI or we'll 220 * lock up on an APIC access. 221 */ 222 wait_for_init_deassert(&init_deasserted); 223 224 /* 225 * (This works even if the APIC is not enabled.) 226 */ 227 phys_id = GET_APIC_ID(apic_read(APIC_ID)); 228 cpuid = smp_processor_id(); 229 if (cpu_isset(cpuid, cpu_callin_map)) { 230 printk("huh, phys CPU#%d, CPU#%d already present??\n", 231 phys_id, cpuid); 232 BUG(); 233 } 234 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 235 236 /* 237 * STARTUP IPIs are fragile beasts as they might sometimes 238 * trigger some glue motherboard logic. 
Complete APIC bus 239 * silence for 1 second, this overestimates the time the 240 * boot CPU is spending to send the up to 2 STARTUP IPIs 241 * by a factor of two. This should be enough. 242 */ 243 244 /* 245 * Waiting 2s total for startup (udelay is not yet working) 246 */ 247 timeout = jiffies + 2*HZ; 248 while (time_before(jiffies, timeout)) { 249 /* 250 * Has the boot CPU finished it's STARTUP sequence? 251 */ 252 if (cpu_isset(cpuid, cpu_callout_map)) 253 break; 254 rep_nop(); 255 } 256 257 if (!time_before(jiffies, timeout)) { 258 printk("BUG: CPU%d started up but did not get a callout!\n", 259 cpuid); 260 BUG(); 261 } 262 263 /* 264 * the boot CPU has finished the init stage and is spinning 265 * on callin_map until we finish. We are free to set up this 266 * CPU, first the APIC. (this is probably redundant on most 267 * boards) 268 */ 269 270 Dprintk("CALLIN, before setup_local_APIC().\n"); 271 smp_callin_clear_local_apic(); 272 setup_local_APIC(); 273 map_cpu_to_logical_apicid(); 274 275 /* 276 * Get our bogomips. 277 */ 278 calibrate_delay(); 279 Dprintk("Stack at about %p\n",&cpuid); 280 281 /* 282 * Save our processor parameters 283 */ 284 smp_store_cpu_info(cpuid); 285 286 /* 287 * Allow the master to continue. 288 */ 289 cpu_set(cpuid, cpu_callin_map); 290} 291 292static int cpucount; 293 294/* maps the cpu to the sched domain representing multi-core */ 295cpumask_t cpu_coregroup_map(int cpu) 296{ 297 struct cpuinfo_x86 *c = cpu_data + cpu; 298 /* 299 * For perf, we return last level cache shared map. 
300 * And for power savings, we return cpu_core_map 301 */ 302 if (sched_mc_power_savings || sched_smt_power_savings) 303 return cpu_core_map[cpu]; 304 else 305 return c->llc_shared_map; 306} 307 308/* representing cpus for which sibling maps can be computed */ 309static cpumask_t cpu_sibling_setup_map; 310 311static inline void 312set_cpu_sibling_map(int cpu) 313{ 314 int i; 315 struct cpuinfo_x86 *c = cpu_data; 316 317 cpu_set(cpu, cpu_sibling_setup_map); 318 319 if (smp_num_siblings > 1) { 320 for_each_cpu_mask(i, cpu_sibling_setup_map) { 321 if (c[cpu].phys_proc_id == c[i].phys_proc_id && 322 c[cpu].cpu_core_id == c[i].cpu_core_id) { 323 cpu_set(i, cpu_sibling_map[cpu]); 324 cpu_set(cpu, cpu_sibling_map[i]); 325 cpu_set(i, cpu_core_map[cpu]); 326 cpu_set(cpu, cpu_core_map[i]); 327 cpu_set(i, c[cpu].llc_shared_map); 328 cpu_set(cpu, c[i].llc_shared_map); 329 } 330 } 331 } else { 332 cpu_set(cpu, cpu_sibling_map[cpu]); 333 } 334 335 cpu_set(cpu, c[cpu].llc_shared_map); 336 337 if (current_cpu_data.x86_max_cores == 1) { 338 cpu_core_map[cpu] = cpu_sibling_map[cpu]; 339 c[cpu].booted_cores = 1; 340 return; 341 } 342 343 for_each_cpu_mask(i, cpu_sibling_setup_map) { 344 if (cpu_llc_id[cpu] != BAD_APICID && 345 cpu_llc_id[cpu] == cpu_llc_id[i]) { 346 cpu_set(i, c[cpu].llc_shared_map); 347 cpu_set(cpu, c[i].llc_shared_map); 348 } 349 if (c[cpu].phys_proc_id == c[i].phys_proc_id) { 350 cpu_set(i, cpu_core_map[cpu]); 351 cpu_set(cpu, cpu_core_map[i]); 352 /* 353 * Does this new cpu bringup a new core? 
354 */ 355 if (cpus_weight(cpu_sibling_map[cpu]) == 1) { 356 /* 357 * for each core in package, increment 358 * the booted_cores for this new cpu 359 */ 360 if (first_cpu(cpu_sibling_map[i]) == i) 361 c[cpu].booted_cores++; 362 /* 363 * increment the core count for all 364 * the other cpus in this package 365 */ 366 if (i != cpu) 367 c[i].booted_cores++; 368 } else if (i != cpu && !c[cpu].booted_cores) 369 c[cpu].booted_cores = c[i].booted_cores; 370 } 371 } 372} 373 374/* 375 * Activate a secondary processor. 376 */ 377static void __cpuinit start_secondary(void *unused) 378{ 379 /* 380 * Don't put *anything* before cpu_init(), SMP booting is too 381 * fragile that we want to limit the things done here to the 382 * most necessary things. 383 */ 384#ifdef CONFIG_VMI 385 vmi_bringup(); 386#endif 387 cpu_init(); 388 preempt_disable(); 389 smp_callin(); 390 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 391 rep_nop(); 392 /* 393 * Check TSC synchronization with the BP: 394 */ 395 check_tsc_sync_target(); 396 397 setup_secondary_clock(); 398 if (nmi_watchdog == NMI_IO_APIC) { 399 disable_8259A_irq(0); 400 enable_NMI_through_LVT0(NULL); 401 enable_8259A_irq(0); 402 } 403 /* 404 * low-memory mappings have been cleared, flush them from 405 * the local TLBs too. 406 */ 407 local_flush_tlb(); 408 409 /* This must be done before setting cpu_online_map */ 410 set_cpu_sibling_map(raw_smp_processor_id()); 411 wmb(); 412 413 /* 414 * We need to hold call_lock, so there is no inconsistency 415 * between the time smp_call_function() determines number of 416 * IPI receipients, and the time when the determination is made 417 * for which cpus receive the IPI. Holding this 418 * lock helps us to not include this cpu in a currently in progress 419 * smp_call_function(). 
420 */ 421 lock_ipi_call_lock(); 422 cpu_set(smp_processor_id(), cpu_online_map); 423 unlock_ipi_call_lock(); 424 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 425 426 /* We can take interrupts now: we're officially "up". */ 427 local_irq_enable(); 428 429 wmb(); 430 cpu_idle(); 431} 432 433/* 434 * Everything has been set up for the secondary 435 * CPUs - they just need to reload everything 436 * from the task structure 437 * This function must not return. 438 */ 439void __devinit initialize_secondary(void) 440{ 441 /* 442 * We don't actually need to load the full TSS, 443 * basically just the stack pointer and the eip. 444 */ 445 446 asm volatile( 447 "movl %0,%%esp\n\t" 448 "jmp *%1" 449 : 450 :"m" (current->thread.esp),"m" (current->thread.eip)); 451} 452 453/* Static state in head.S used to set up a CPU */ 454extern struct { 455 void * esp; 456 unsigned short ss; 457} stack_start; 458 459#ifdef CONFIG_NUMA 460 461/* which logical CPUs are on which nodes */ 462cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = 463 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; 464EXPORT_SYMBOL(node_2_cpu_mask); 465/* which node each logical CPU is on */ 466int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 467EXPORT_SYMBOL(cpu_2_node); 468 469/* set up a mapping between cpu and node. */ 470static inline void map_cpu_to_node(int cpu, int node) 471{ 472 printk("Mapping cpu %d to node %d\n", cpu, node); 473 cpu_set(cpu, node_2_cpu_mask[node]); 474 cpu_2_node[cpu] = node; 475} 476 477/* undo a mapping between cpu and node. 
*/ 478static inline void unmap_cpu_to_node(int cpu) 479{ 480 int node; 481 482 printk("Unmapping cpu %d from all nodes\n", cpu); 483 for (node = 0; node < MAX_NUMNODES; node ++) 484 cpu_clear(cpu, node_2_cpu_mask[node]); 485 cpu_2_node[cpu] = 0; 486} 487#else /* !CONFIG_NUMA */ 488 489#define map_cpu_to_node(cpu, node) ({}) 490#define unmap_cpu_to_node(cpu) ({}) 491 492#endif /* CONFIG_NUMA */ 493 494u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; 495 496static void map_cpu_to_logical_apicid(void) 497{ 498 int cpu = smp_processor_id(); 499 int apicid = logical_smp_processor_id(); 500 int node = apicid_to_node(apicid); 501 502 if (!node_online(node)) 503 node = first_online_node; 504 505 cpu_2_logical_apicid[cpu] = apicid; 506 map_cpu_to_node(cpu, node); 507} 508 509static void unmap_cpu_to_logical_apicid(int cpu) 510{ 511 cpu_2_logical_apicid[cpu] = BAD_APICID; 512 unmap_cpu_to_node(cpu); 513} 514 515static inline void __inquire_remote_apic(int apicid) 516{ 517 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 518 char *names[] = { "ID", "VERSION", "SPIV" }; 519 int timeout; 520 unsigned long status; 521 522 printk("Inquiring remote APIC #%d...\n", apicid); 523 524 for (i = 0; i < ARRAY_SIZE(regs); i++) { 525 printk("... APIC #%d %s: ", apicid, names[i]); 526 527 /* 528 * Wait for idle. 
529 */ 530 status = safe_apic_wait_icr_idle(); 531 if (status) 532 printk("a previous APIC delivery may have failed\n"); 533 534 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 535 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 536 537 timeout = 0; 538 do { 539 udelay(100); 540 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; 541 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); 542 543 switch (status) { 544 case APIC_ICR_RR_VALID: 545 status = apic_read(APIC_RRR); 546 printk("%lx\n", status); 547 break; 548 default: 549 printk("failed\n"); 550 } 551 } 552} 553 554#ifdef WAKE_SECONDARY_VIA_NMI 555/* 556 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal 557 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 558 * won't ... remember to clear down the APIC, etc later. 559 */ 560static int __devinit 561wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 562{ 563 unsigned long send_status, accept_status = 0; 564 int maxlvt; 565 566 /* Target chip */ 567 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 568 569 /* Boot on the stack */ 570 /* Kick the second */ 571 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 572 573 Dprintk("Waiting for send to finish...\n"); 574 send_status = safe_apic_wait_icr_idle(); 575 576 /* 577 * Give the other CPU some time to accept the IPI. 578 */ 579 udelay(200); 580 /* 581 * Due to the Pentium erratum 3AP. 
582 */ 583 maxlvt = lapic_get_maxlvt(); 584 if (maxlvt > 3) { 585 apic_read_around(APIC_SPIV); 586 apic_write(APIC_ESR, 0); 587 } 588 accept_status = (apic_read(APIC_ESR) & 0xEF); 589 Dprintk("NMI sent.\n"); 590 591 if (send_status) 592 printk("APIC never delivered???\n"); 593 if (accept_status) 594 printk("APIC delivery error (%lx).\n", accept_status); 595 596 return (send_status | accept_status); 597} 598#endif /* WAKE_SECONDARY_VIA_NMI */ 599 600#ifdef WAKE_SECONDARY_VIA_INIT 601static int __devinit 602wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) 603{ 604 unsigned long send_status, accept_status = 0; 605 int maxlvt, num_starts, j; 606 607 /* 608 * Be paranoid about clearing APIC errors. 609 */ 610 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 611 apic_read_around(APIC_SPIV); 612 apic_write(APIC_ESR, 0); 613 apic_read(APIC_ESR); 614 } 615 616 Dprintk("Asserting INIT.\n"); 617 618 /* 619 * Turn INIT on target chip 620 */ 621 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 622 623 /* 624 * Send IPI 625 */ 626 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 627 | APIC_DM_INIT); 628 629 Dprintk("Waiting for send to finish...\n"); 630 send_status = safe_apic_wait_icr_idle(); 631 632 mdelay(10); 633 634 Dprintk("Deasserting INIT.\n"); 635 636 /* Target chip */ 637 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 638 639 /* Send IPI */ 640 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 641 642 Dprintk("Waiting for send to finish...\n"); 643 send_status = safe_apic_wait_icr_idle(); 644 645 atomic_set(&init_deasserted, 1); 646 647 /* 648 * Should we send STARTUP IPIs ? 649 * 650 * Determine this based on the APIC version. 651 * If we don't have an integrated APIC, don't send the STARTUP IPIs. 
652 */ 653 if (APIC_INTEGRATED(apic_version[phys_apicid])) 654 num_starts = 2; 655 else 656 num_starts = 0; 657 658 /* 659 * Paravirt / VMI wants a startup IPI hook here to set up the 660 * target processor state. 661 */ 662 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 663 (unsigned long) stack_start.esp); 664 665 /* 666 * Run STARTUP IPI loop. 667 */ 668 Dprintk("#startup loops: %d.\n", num_starts); 669 670 maxlvt = lapic_get_maxlvt(); 671 672 for (j = 1; j <= num_starts; j++) { 673 Dprintk("Sending STARTUP #%d.\n",j); 674 apic_read_around(APIC_SPIV); 675 apic_write(APIC_ESR, 0); 676 apic_read(APIC_ESR); 677 Dprintk("After apic_write.\n"); 678 679 /* 680 * STARTUP IPI 681 */ 682 683 /* Target chip */ 684 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 685 686 /* Boot on the stack */ 687 /* Kick the second */ 688 apic_write_around(APIC_ICR, APIC_DM_STARTUP 689 | (start_eip >> 12)); 690 691 /* 692 * Give the other CPU some time to accept the IPI. 693 */ 694 udelay(300); 695 696 Dprintk("Startup point 1.\n"); 697 698 Dprintk("Waiting for send to finish...\n"); 699 send_status = safe_apic_wait_icr_idle(); 700 701 /* 702 * Give the other CPU some time to accept the IPI. 703 */ 704 udelay(200); 705 /* 706 * Due to the Pentium erratum 3AP. 
707 */ 708 if (maxlvt > 3) { 709 apic_read_around(APIC_SPIV); 710 apic_write(APIC_ESR, 0); 711 } 712 accept_status = (apic_read(APIC_ESR) & 0xEF); 713 if (send_status || accept_status) 714 break; 715 } 716 Dprintk("After Startup.\n"); 717 718 if (send_status) 719 printk("APIC never delivered???\n"); 720 if (accept_status) 721 printk("APIC delivery error (%lx).\n", accept_status); 722 723 return (send_status | accept_status); 724} 725#endif /* WAKE_SECONDARY_VIA_INIT */ 726 727extern cpumask_t cpu_initialized; 728static inline int alloc_cpu_id(void) 729{ 730 cpumask_t tmp_map; 731 int cpu; 732 cpus_complement(tmp_map, cpu_present_map); 733 cpu = first_cpu(tmp_map); 734 if (cpu >= NR_CPUS) 735 return -ENODEV; 736 return cpu; 737} 738 739#ifdef CONFIG_HOTPLUG_CPU 740static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS]; 741static inline struct task_struct * alloc_idle_task(int cpu) 742{ 743 struct task_struct *idle; 744 745 if ((idle = cpu_idle_tasks[cpu]) != NULL) { 746 /* initialize thread_struct. we really want to avoid destroy 747 * idle tread 748 */ 749 idle->thread.esp = (unsigned long)task_pt_regs(idle); 750 init_idle(idle, cpu); 751 return idle; 752 } 753 idle = fork_idle(cpu); 754 755 if (!IS_ERR(idle)) 756 cpu_idle_tasks[cpu] = idle; 757 return idle; 758} 759#else 760#define alloc_idle_task(cpu) fork_idle(cpu) 761#endif 762 763static int __cpuinit do_boot_cpu(int apicid, int cpu) 764/* 765 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 766 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 767 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 768 */ 769{ 770 struct task_struct *idle; 771 unsigned long boot_error; 772 int timeout; 773 unsigned long start_eip; 774 unsigned short nmi_high = 0, nmi_low = 0; 775 776 /* 777 * Save current MTRR state in case it was changed since early boot 778 * (e.g. 
by the ACPI SMI) to initialize new CPUs with MTRRs in sync: 779 */ 780 mtrr_save_state(); 781 782 /* 783 * We can't use kernel_thread since we must avoid to 784 * reschedule the child. 785 */ 786 idle = alloc_idle_task(cpu); 787 if (IS_ERR(idle)) 788 panic("failed fork for CPU %d", cpu); 789 790 init_gdt(cpu); 791 per_cpu(current_task, cpu) = idle; 792 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 793 794 idle->thread.eip = (unsigned long) start_secondary; 795 /* start_eip had better be page-aligned! */ 796 start_eip = setup_trampoline(); 797 798 ++cpucount; 799 alternatives_smp_switch(1); 800 801 /* So we see what's up */ 802 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 803 /* Stack for startup_32 can be just as for start_secondary onwards */ 804 stack_start.esp = (void *) idle->thread.esp; 805 806 irq_ctx_init(cpu); 807 808 x86_cpu_to_apicid[cpu] = apicid; 809 /* 810 * This grunge runs the startup process for 811 * the targeted processor. 812 */ 813 814 atomic_set(&init_deasserted, 0); 815 816 Dprintk("Setting warm reset code and vector.\n"); 817 818 store_NMI_vector(&nmi_high, &nmi_low); 819 820 smpboot_setup_warm_reset_vector(start_eip); 821 822 /* 823 * Starting actual IPI sequence... 824 */ 825 boot_error = wakeup_secondary_cpu(apicid, start_eip); 826 827 if (!boot_error) { 828 /* 829 * allow APs to start initializing. 
830 */ 831 Dprintk("Before Callout %d.\n", cpu); 832 cpu_set(cpu, cpu_callout_map); 833 Dprintk("After Callout %d.\n", cpu); 834 835 /* 836 * Wait 5s total for a response 837 */ 838 for (timeout = 0; timeout < 50000; timeout++) { 839 if (cpu_isset(cpu, cpu_callin_map)) 840 break; /* It has booted */ 841 udelay(100); 842 } 843 844 if (cpu_isset(cpu, cpu_callin_map)) { 845 /* number CPUs logically, starting from 1 (BSP is 0) */ 846 Dprintk("OK.\n"); 847 printk("CPU%d: ", cpu); 848 print_cpu_info(&cpu_data[cpu]); 849 Dprintk("CPU has booted.\n"); 850 } else { 851 boot_error= 1; 852 if (*((volatile unsigned char *)trampoline_base) 853 == 0xA5) 854 /* trampoline started but...? */ 855 printk("Stuck ??\n"); 856 else 857 /* trampoline code not run */ 858 printk("Not responding.\n"); 859 inquire_remote_apic(apicid); 860 } 861 } 862 863 if (boot_error) { 864 /* Try to put things back the way they were before ... */ 865 unmap_cpu_to_logical_apicid(cpu); 866 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 867 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 868 cpucount--; 869 } else { 870 x86_cpu_to_apicid[cpu] = apicid; 871 cpu_set(cpu, cpu_present_map); 872 } 873 874 /* mark "stuck" area as not stuck */ 875 *((volatile unsigned long *)trampoline_base) = 0; 876 877 return boot_error; 878} 879 880#ifdef CONFIG_HOTPLUG_CPU 881void cpu_exit_clear(void) 882{ 883 int cpu = raw_smp_processor_id(); 884 885 idle_task_exit(); 886 887 cpucount --; 888 cpu_uninit(); 889 irq_ctx_exit(cpu); 890 891 cpu_clear(cpu, cpu_callout_map); 892 cpu_clear(cpu, cpu_callin_map); 893 894 cpu_clear(cpu, smp_commenced_mask); 895 unmap_cpu_to_logical_apicid(cpu); 896} 897 898struct warm_boot_cpu_info { 899 struct completion *complete; 900 struct work_struct task; 901 int apicid; 902 int cpu; 903}; 904 905static void __cpuinit do_warm_boot_cpu(struct work_struct *work) 906{ 907 struct warm_boot_cpu_info *info = 908 container_of(work, struct warm_boot_cpu_info, task); 909 
do_boot_cpu(info->apicid, info->cpu); 910 complete(info->complete); 911} 912 913static int __cpuinit __smp_prepare_cpu(int cpu) 914{ 915 DECLARE_COMPLETION_ONSTACK(done); 916 struct warm_boot_cpu_info info; 917 int apicid, ret; 918 919 apicid = x86_cpu_to_apicid[cpu]; 920 if (apicid == BAD_APICID) { 921 ret = -ENODEV; 922 goto exit; 923 } 924 925 info.complete = &done; 926 info.apicid = apicid; 927 info.cpu = cpu; 928 INIT_WORK(&info.task, do_warm_boot_cpu); 929 930 /* init low mem mapping */ 931 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 932 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 933 flush_tlb_all(); 934 schedule_work(&info.task); 935 wait_for_completion(&done); 936 937 zap_low_mappings(); 938 ret = 0; 939exit: 940 return ret; 941} 942#endif 943 944/* 945 * Cycle through the processors sending APIC IPIs to boot each. 946 */ 947 948static int boot_cpu_logical_apicid; 949/* Where the IO area was mapped on multiquad, always 0 otherwise */ 950void *xquad_portio; 951#ifdef CONFIG_X86_NUMAQ 952EXPORT_SYMBOL(xquad_portio); 953#endif 954 955static void __init smp_boot_cpus(unsigned int max_cpus) 956{ 957 int apicid, cpu, bit, kicked; 958 unsigned long bogosum = 0; 959 960 /* 961 * Setup boot CPU information 962 */ 963 smp_store_cpu_info(0); /* Final full version of the data */ 964 printk("CPU%d: ", 0); 965 print_cpu_info(&cpu_data[0]); 966 967 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); 968 boot_cpu_logical_apicid = logical_smp_processor_id(); 969 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; 970 971 current_thread_info()->cpu = 0; 972 973 set_cpu_sibling_map(0); 974 975 /* 976 * If we couldn't find an SMP configuration at boot time, 977 * get out of here now! 
978 */ 979 if (!smp_found_config && !acpi_lapic) { 980 printk(KERN_NOTICE "SMP motherboard not detected.\n"); 981 smpboot_clear_io_apic_irqs(); 982 phys_cpu_present_map = physid_mask_of_physid(0); 983 if (APIC_init_uniprocessor()) 984 printk(KERN_NOTICE "Local APIC not detected." 985 " Using dummy APIC emulation.\n"); 986 map_cpu_to_logical_apicid(); 987 cpu_set(0, cpu_sibling_map[0]); 988 cpu_set(0, cpu_core_map[0]); 989 return; 990 } 991 992 /* 993 * Should not be necessary because the MP table should list the boot 994 * CPU too, but we do it for the sake of robustness anyway. 995 * Makes no sense to do this check in clustered apic mode, so skip it 996 */ 997 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { 998 printk("weird, boot CPU (#%d) not listed by the BIOS.\n", 999 boot_cpu_physical_apicid); 1000 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1001 } 1002 1003 /* 1004 * If we couldn't find a local APIC, then get out of here now! 1005 */ 1006 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { 1007 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1008 boot_cpu_physical_apicid); 1009 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); 1010 smpboot_clear_io_apic_irqs(); 1011 phys_cpu_present_map = physid_mask_of_physid(0); 1012 cpu_set(0, cpu_sibling_map[0]); 1013 cpu_set(0, cpu_core_map[0]); 1014 return; 1015 } 1016 1017 verify_local_APIC(); 1018 1019 /* 1020 * If SMP should be disabled, then really disable it! 
1021 */ 1022 if (!max_cpus) { 1023 smp_found_config = 0; 1024 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); 1025 smpboot_clear_io_apic_irqs(); 1026 phys_cpu_present_map = physid_mask_of_physid(0); 1027 cpu_set(0, cpu_sibling_map[0]); 1028 cpu_set(0, cpu_core_map[0]); 1029 return; 1030 } 1031 1032 connect_bsp_APIC(); 1033 setup_local_APIC(); 1034 map_cpu_to_logical_apicid(); 1035 1036 1037 setup_portio_remap(); 1038 1039 /* 1040 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu 1041 * 1042 * In clustered apic mode, phys_cpu_present_map is a constructed thus: 1043 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 1044 * clustered apic ID. 1045 */ 1046 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); 1047 1048 kicked = 1; 1049 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { 1050 apicid = cpu_present_to_apicid(bit); 1051 /* 1052 * Don't even attempt to start the boot CPU! 1053 */ 1054 if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID)) 1055 continue; 1056 1057 if (!check_apicid_present(bit)) 1058 continue; 1059 if (max_cpus <= cpucount+1) 1060 continue; 1061 1062 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) 1063 printk("CPU #%d not responding - cannot use it.\n", 1064 apicid); 1065 else 1066 ++kicked; 1067 } 1068 1069 /* 1070 * Cleanup possible dangling ends... 1071 */ 1072 smpboot_restore_warm_reset_vector(); 1073 1074 /* 1075 * Allow the user to impress friends. 
1076 */ 1077 Dprintk("Before bogomips.\n"); 1078 for (cpu = 0; cpu < NR_CPUS; cpu++) 1079 if (cpu_isset(cpu, cpu_callout_map)) 1080 bogosum += cpu_data[cpu].loops_per_jiffy; 1081 printk(KERN_INFO 1082 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", 1083 cpucount+1, 1084 bogosum/(500000/HZ), 1085 (bogosum/(5000/HZ))%100); 1086 1087 Dprintk("Before bogocount - setting activated=1.\n"); 1088 1089 if (smp_b_stepping) 1090 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); 1091 1092 /* 1093 * Don't taint if we are running SMP kernel on a single non-MP 1094 * approved Athlon 1095 */ 1096 if (tainted & TAINT_UNSAFE_SMP) { 1097 if (cpucount) 1098 printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n"); 1099 else 1100 tainted &= ~TAINT_UNSAFE_SMP; 1101 } 1102 1103 Dprintk("Boot done.\n"); 1104 1105 /* 1106 * construct cpu_sibling_map[], so that we can tell sibling CPUs 1107 * efficiently. 1108 */ 1109 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1110 cpus_clear(cpu_sibling_map[cpu]); 1111 cpus_clear(cpu_core_map[cpu]); 1112 } 1113 1114 cpu_set(0, cpu_sibling_map[0]); 1115 cpu_set(0, cpu_core_map[0]); 1116 1117 smpboot_setup_io_apic(); 1118 1119 setup_boot_clock(); 1120} 1121 1122/* These are wrappers to interface to the new boot process. Someone 1123 who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ 1124void __init native_smp_prepare_cpus(unsigned int max_cpus) 1125{ 1126 smp_commenced_mask = cpumask_of_cpu(0); 1127 cpu_callin_map = cpumask_of_cpu(0); 1128 mb(); 1129 smp_boot_cpus(max_cpus); 1130} 1131 1132void __init native_smp_prepare_boot_cpu(void) 1133{ 1134 unsigned int cpu = smp_processor_id(); 1135 1136 init_gdt(cpu); 1137 switch_to_new_gdt(); 1138 1139 cpu_set(cpu, cpu_online_map); 1140 cpu_set(cpu, cpu_callout_map); 1141 cpu_set(cpu, cpu_present_map); 1142 cpu_set(cpu, cpu_possible_map); 1143 __get_cpu_var(cpu_state) = CPU_ONLINE; 1144} 1145 1146#ifdef CONFIG_HOTPLUG_CPU 1147static void 1148remove_siblinginfo(int cpu) 1149{ 1150 int sibling; 1151 struct cpuinfo_x86 *c = cpu_data; 1152 1153 for_each_cpu_mask(sibling, cpu_core_map[cpu]) { 1154 cpu_clear(cpu, cpu_core_map[sibling]); 1155 /* 1156 * last thread sibling in this cpu core going down 1157 */ 1158 if (cpus_weight(cpu_sibling_map[cpu]) == 1) 1159 c[sibling].booted_cores--; 1160 } 1161 1162 for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) 1163 cpu_clear(cpu, cpu_sibling_map[sibling]); 1164 cpus_clear(cpu_sibling_map[cpu]); 1165 cpus_clear(cpu_core_map[cpu]); 1166 c[cpu].phys_proc_id = 0; 1167 c[cpu].cpu_core_id = 0; 1168 cpu_clear(cpu, cpu_sibling_setup_map); 1169} 1170 1171int __cpu_disable(void) 1172{ 1173 cpumask_t map = cpu_online_map; 1174 int cpu = smp_processor_id(); 1175 1176 /* 1177 * Perhaps use cpufreq to drop frequency, but that could go 1178 * into generic code. 1179 * 1180 * We won't take down the boot processor on i386 due to some 1181 * interrupts only being able to be serviced by the BSP. 
1182 * Especially so if we're not using an IOAPIC -zwane 1183 */ 1184 if (cpu == 0) 1185 return -EBUSY; 1186 if (nmi_watchdog == NMI_LOCAL_APIC) 1187 stop_apic_nmi_watchdog(NULL); 1188 clear_local_APIC(); 1189 /* Allow any queued timer interrupts to get serviced */ 1190 local_irq_enable(); 1191 mdelay(1); 1192 local_irq_disable(); 1193 1194 remove_siblinginfo(cpu); 1195 1196 cpu_clear(cpu, map); 1197 fixup_irqs(map); 1198 /* It's now safe to remove this processor from the online map */ 1199 cpu_clear(cpu, cpu_online_map); 1200 return 0; 1201} 1202 1203void __cpu_die(unsigned int cpu) 1204{ 1205 /* We don't do anything here: idle task is faking death itself. */ 1206 unsigned int i; 1207 1208 for (i = 0; i < 10; i++) { 1209 /* They ack this in play_dead by setting CPU_DEAD */ 1210 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1211 printk ("CPU %d is now offline\n", cpu); 1212 if (1 == num_online_cpus()) 1213 alternatives_smp_switch(0); 1214 return; 1215 } 1216 msleep(100); 1217 } 1218 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1219} 1220#else /* ... !CONFIG_HOTPLUG_CPU */ 1221int __cpu_disable(void) 1222{ 1223 return -ENOSYS; 1224} 1225 1226void __cpu_die(unsigned int cpu) 1227{ 1228 /* We said "no" in __cpu_disable */ 1229 BUG(); 1230} 1231#endif /* CONFIG_HOTPLUG_CPU */ 1232 1233int __cpuinit native_cpu_up(unsigned int cpu) 1234{ 1235 unsigned long flags; 1236#ifdef CONFIG_HOTPLUG_CPU 1237 int ret = 0; 1238 1239 /* 1240 * We do warm boot only on cpus that had booted earlier 1241 * Otherwise cold boot is all handled from smp_boot_cpus(). 1242 * cpu_callin_map is set during AP kickstart process. Its reset 1243 * when a cpu is taken offline from cpu_exit_clear(). 
1244 */ 1245 if (!cpu_isset(cpu, cpu_callin_map)) 1246 ret = __smp_prepare_cpu(cpu); 1247 1248 if (ret) 1249 return -EIO; 1250#endif 1251 1252 /* In case one didn't come up */ 1253 if (!cpu_isset(cpu, cpu_callin_map)) { 1254 printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); 1255 return -EIO; 1256 } 1257 1258 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1259 /* Unleash the CPU! */ 1260 cpu_set(cpu, smp_commenced_mask); 1261 1262 /* 1263 * Check TSC synchronization with the AP (keep irqs disabled 1264 * while doing so): 1265 */ 1266 local_irq_save(flags); 1267 check_tsc_sync_source(cpu); 1268 local_irq_restore(flags); 1269 1270 while (!cpu_isset(cpu, cpu_online_map)) { 1271 cpu_relax(); 1272 touch_nmi_watchdog(); 1273 } 1274 1275 return 0; 1276} 1277 1278void __init native_smp_cpus_done(unsigned int max_cpus) 1279{ 1280#ifdef CONFIG_X86_IO_APIC 1281 setup_ioapic_dest(); 1282#endif 1283 zap_low_mappings(); 1284#ifndef CONFIG_HOTPLUG_CPU 1285 /* 1286 * Disable executability of the SMP trampoline: 1287 */ 1288 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); 1289#endif 1290} 1291 1292void __init smp_intr_init(void) 1293{ 1294 /* 1295 * IRQ0 must be given a fixed assignment and initialized, 1296 * because it's used before the IO-APIC is set up. 1297 */ 1298 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); 1299 1300 /* 1301 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 1302 * IPI, driven by wakeup. 1303 */ 1304 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 1305 1306 /* IPI for invalidation */ 1307 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 1308 1309 /* IPI for generic function call */ 1310 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 1311} 1312 1313/* 1314 * If the BIOS enumerates physical processors before logical, 1315 * maxcpus=N at enumeration-time can be used to disable HT. 
1316 */ 1317static int __init parse_maxcpus(char *arg) 1318{ 1319 extern unsigned int maxcpus; 1320 1321 maxcpus = simple_strtoul(arg, NULL, 0); 1322 return 0; 1323} 1324early_param("maxcpus", parse_maxcpus); 1325