mp_machdep.c revision 196256
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 196256 2009-08-15 18:37:06Z attilio $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

int	mp_naps;		/* # of Applications processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

struct pcb stoppcbs[MAXCPU];
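
/*
 * Unlike the native i386 MP code, this Xen port delivers IPIs over event
 * channels rather than through the local APIC: xen_smp_intr_init() below
 * binds one port per CPU for RESCHEDULE_VECTOR (bitmapped IPIs) and one
 * for CALL_FUNCTION_VECTOR (function IPIs dispatched via ipi_vectors[]).
 */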

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;
static volatile cpumask_t ipi_nmi_pending;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
        int     cpu_present:1;
        int     cpu_bsp:1;
        int     cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;
static int cpu_cores;

static void     assign_cpu_ids(void);
static void     set_interrupt_apic_ids(void);
int     start_all_aps(void);
static int      start_ap(int apic_id);
static void     release_aps(void *dummy);

static u_int    hyperthreading_cpus;
static cpumask_t        hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

struct cpu_group *
cpu_topo(void)
{
        if (cpu_cores == 0)
                cpu_cores = 1;
        if (cpu_logical == 0)
                cpu_logical = 1;
        if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
                printf("WARNING: Non-uniform processors.\n");
                printf("WARNING: Using suboptimal topology.\n");
                return (smp_topo_none());
        }
        /*
         * No multi-core or hyper-threaded.
         */
        if (cpu_logical * cpu_cores == 1)
                return (smp_topo_none());
        /*
         * Only HTT no multi-core.
         */
        if (cpu_logical > 1 && cpu_cores == 1)
                return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
        /*
         * Only multi-core no HTT.
         */
        if (cpu_cores > 1 && cpu_logical == 1)
                return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
        /*
         * Both HTT and multi-core.
         */
        return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
            CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}
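
/*
 * Worked example for cpu_topo() above: with cpu_cores = 2 and
 * cpu_logical = 2 it returns smp_topo_2level(CG_SHARE_NONE, 2,
 * CG_SHARE_L1, 2, CG_FLAG_HTT), i.e. two groups of two HTT siblings,
 * each pair sharing an L1 cache with nothing shared between the cores.
 */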

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

        return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

        if (apic_id > MAX_APIC_ID) {
                panic("SMP: APIC ID %d too high", apic_id);
                return;
        }
        KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
            apic_id));
        cpu_info[apic_id].cpu_present = 1;
        if (boot_cpu) {
                KASSERT(boot_cpu_id == -1,
                    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
                    boot_cpu_id));
                boot_cpu_id = apic_id;
                cpu_info[apic_id].cpu_bsp = 1;
        }
        if (mp_ncpus < MAXCPU)
                mp_ncpus++;
        if (bootverbose)
                printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
                    "AP");
}

void
cpu_mp_setmaxid(void)
{

        mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

        /*
         * Always record BSP in CPU map so that the mbuf init code works
         * correctly.
         */
        all_cpus = 1;
        if (mp_ncpus == 0) {
                /*
                 * No CPUs were found, so this must be a UP system.  Setup
                 * the variables to represent a system with a single CPU
                 * with an id of 0.
                 */
                mp_ncpus = 1;
                return (0);
        }

        /* At least one CPU was found. */
        if (mp_ncpus == 1) {
                /*
                 * One CPU was found, so this must be a UP system with
                 * an I/O APIC.
                 */
                return (0);
        }

        /* At least two CPUs were found. */
        return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
        int i;

        /* Initialize the logical ID to APIC ID table. */
        for (i = 0; i < MAXCPU; i++) {
                cpu_apic_ids[i] = -1;
                cpu_ipi_pending[i] = 0;
        }

        /* Set boot_cpu_id if needed. */
        if (boot_cpu_id == -1) {
                boot_cpu_id = PCPU_GET(apic_id);
                cpu_info[boot_cpu_id].cpu_bsp = 1;
        } else
                KASSERT(boot_cpu_id == PCPU_GET(apic_id),
                    ("BSP's APIC ID doesn't match boot_cpu_id"));
        cpu_apic_ids[0] = boot_cpu_id;
        apic_cpuids[boot_cpu_id] = 0;

        assign_cpu_ids();

        /* Start each Application Processor */
        start_all_aps();

        /* Setup the initial logical CPUs info. */
        logical_cpus = logical_cpus_mask = 0;
        if (cpu_feature & CPUID_HTT)
                logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

        set_interrupt_apic_ids();
}

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
        smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
        xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
        xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
        vm_offset_t start = (vm_offset_t)a;
        vm_offset_t end = (vm_offset_t)b;

        while (start < end) {
                xen_invlpg(start);
                start += PAGE_SIZE;
        }
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

        wbinvd();
        atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
        pmap_lazyfix_action();
        atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * These start from "IPI offset" APIC_IPI_INTS
 */
static call_data_func_t *ipi_vectors[6] =
{
        iv_rendezvous,
        iv_invltlb,
        iv_invlpg,
        iv_invlrng,
        iv_invlcache,
        iv_lazypmap,
};
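
/*
 * A function IPI is demultiplexed by smp_call_function_interrupt() below
 * as ipi_vectors[func_id - APIC_IPI_INTS]; e.g. an IPI_INVLTLB request
 * lands on iv_invltlb().  The table order therefore has to match the
 * IPI_* numbering that starts at APIC_IPI_INTS.
 */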

/*
 * Reschedule call back.  Nothing to do, all the work is done automatically
 * when we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
        int cpu = PCPU_GET(cpuid);
        u_int ipi_bitmap;

        ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

        if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
                (*ipi_preempt_counts[cpu])++;
#endif
                sched_preempt(curthread);
        }

        if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
                (*ipi_ast_counts[cpu])++;
#endif
                /* Nothing to do for AST */
        }
        return (FILTER_HANDLED);
}

struct _call_data {
        uint16_t func_id;
        uint16_t wait;
        uintptr_t arg1;
        uintptr_t arg2;
        atomic_t started;
        atomic_t finished;
};

static struct _call_data *call_data;

static int
smp_call_function_interrupt(void *unused)
{
        call_data_func_t *func;
        uintptr_t arg1 = call_data->arg1;
        uintptr_t arg2 = call_data->arg2;
        int wait = call_data->wait;
        atomic_t *started = &call_data->started;
        atomic_t *finished = &call_data->finished;

        /* We only handle function IPIs, not bitmap IPIs */
        if (call_data->func_id < APIC_IPI_INTS ||
            call_data->func_id > IPI_BITMAP_VECTOR)
                panic("invalid function id %u", call_data->func_id);

        func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
        /*
         * Notify initiating CPU that I've grabbed the data and am
         * about to execute the function
         */
        mb();
        atomic_inc(started);
        /*
         * At this point the info structure may be out of scope unless wait==1
         */
        (*func)(arg1, arg2);

        if (wait) {
                mb();
                atomic_inc(finished);
        }
        atomic_add_int(&smp_tlb_wait, 1);
        return (FILTER_HANDLED);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
        int i, x;

        /* List CPUs */
        printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
        for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
                if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
                        continue;
                if (cpu_info[x].cpu_disabled)
                        printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
                else {
                        KASSERT(i < mp_ncpus,
                            ("mp_ncpus and actual cpus are out of whack"));
                        printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
                }
        }
}

static int
xen_smp_intr_init(unsigned int cpu)
{
        int rc;
        unsigned int irq;

        per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

        sprintf(resched_name[cpu], "resched%u", cpu);
        rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
            cpu,
            resched_name[cpu],
            smp_reschedule_interrupt,
            INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
        if (rc < 0)
                goto fail;

        printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
            cpu, irq, RESCHEDULE_VECTOR);

        per_cpu(resched_irq, cpu) = irq;

        sprintf(callfunc_name[cpu], "callfunc%u", cpu);
        rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
            cpu,
            callfunc_name[cpu],
            smp_call_function_interrupt,
            INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
        if (rc < 0)
                goto fail;
        per_cpu(callfunc_irq, cpu) = irq;

        printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
            cpu, irq, CALL_FUNCTION_VECTOR);

        if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
                goto fail;

        return 0;

 fail:
        if (per_cpu(resched_irq, cpu) >= 0)
                unbind_from_irqhandler(per_cpu(resched_irq, cpu));
        if (per_cpu(callfunc_irq, cpu) >= 0)
                unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
        return rc;
}
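
/*
 * xen_smp_intr_init_cpus() runs from the start_ipis SYSINIT at SI_SUB_INTR
 * (registered at the bottom of this file), binding the IPI event channels
 * for every CPU once interrupt delivery is available.
 */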

static void
xen_smp_intr_init_cpus(void *unused)
{
        int i;

        for (i = 0; i < mp_ncpus; i++)
                xen_smp_intr_init(i);
}

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPU's call this to initialize themselves.
 */
void
init_secondary(void)
{
        vm_offset_t addr;
        int     gsel_tss;

        /* bootAP is set in start_ap() to our ID. */
        PCPU_SET(currentldt, _default_ldt);
        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
        gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
        PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
        PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
        PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
        PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
        PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
        PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

        /*
         * Set to a known state:
         *   Set by mpboot.s:    CR0_PG, CR0_PE
         *   Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
         */
        /*
         * signal our startup to the BSP.
         */
        mp_naps++;

        /* Spin until the BSP releases the AP's. */
        while (!aps_ready)
                ia32_pause();

        /* BSP may have changed PTD while we were waiting */
        invltlb();
        for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
                invlpg(addr);

        /* set up FPU state on the AP */
        npxinit();
#if 0
        /* set up SSE registers */
        enable_sse();
#endif
#if 0 && defined(PAE)
        /* Enable the PTE no-execute bit. */
        if ((amd_feature & AMDID_NX) != 0) {
                uint64_t msr;

                msr = rdmsr(MSR_EFER) | EFER_NXE;
                wrmsr(MSR_EFER, msr);
        }
#endif
#if 0
        /* A quick check from sanity claus */
        if (PCPU_GET(apic_id) != lapic_id()) {
                printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
                printf("SMP: actual apic_id = %d\n", lapic_id());
                printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
                panic("cpuid mismatch! boom!!");
        }
#endif

        /* Initialize curthread. */
        KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
        PCPU_SET(curthread, PCPU_GET(idlethread));

        mtx_lock_spin(&ap_boot_mtx);
#if 0
        /* Init local apic for irq's */
        lapic_setup(1);
#endif
        smp_cpus++;

        CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
        printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

        /* Determine if we are a logical CPU. */
        if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
                logical_cpus_mask |= PCPU_GET(cpumask);

        /* Determine if we are a hyperthread. */
        if (hyperthreading_cpus > 1 &&
            PCPU_GET(apic_id) % hyperthreading_cpus != 0)
                hyperthreading_cpus_mask |= PCPU_GET(cpumask);

        /* Build our map of 'other' CPUs. */
        PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
        if (bootverbose)
                lapic_dump("AP");
#endif
        if (smp_cpus == mp_ncpus) {
                /* enable IPI's, tlb shootdown, freezes etc */
                atomic_store_rel_int(&smp_started, 1);
                smp_active = 1;  /* historic */
        }

        mtx_unlock_spin(&ap_boot_mtx);

        /* wait until all the AP's are up */
        while (smp_started == 0)
                ia32_pause();

        PCPU_SET(curthread, PCPU_GET(idlethread));
        /* enter the scheduler */
        sched_throw(NULL);

        panic("scheduler returned us to %s", __func__);
        /* NOTREACHED */
}
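
/*
 * Note that APs never execute a real-mode boot trampoline here:
 * cpu_initialize_context() points the new VCPU's eip directly at
 * init_secondary() above before the VCPUOP_up hypercall brings it online.
 */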

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
        u_int i, apic_id;

        for (i = 0; i < MAXCPU; i++) {
                apic_id = cpu_apic_ids[i];
                if (apic_id == -1)
                        continue;
                if (cpu_info[apic_id].cpu_bsp)
                        continue;
                if (cpu_info[apic_id].cpu_disabled)
                        continue;

                /* Don't let hyperthreads service interrupts. */
                if (hyperthreading_cpus > 1 &&
                    apic_id % hyperthreading_cpus != 0)
                        continue;

                intr_add_cpu(i);
        }
}
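
/*
 * assign_cpu_ids() below rebuilds mp_ncpus from the cpu_info[] table that
 * cpu_add() populated.  resource_disabled("lapic", i) honors the usual
 * device hints, so a CPU can presumably be kept offline with a loader
 * tunable along the lines of hint.lapic.2.disabled=1.
 */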

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
        u_int i;

        /* Check for explicitly disabled CPUs. */
        for (i = 0; i <= MAX_APIC_ID; i++) {
                if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
                        continue;

                /* Don't use this CPU if it has been disabled by a tunable. */
                if (resource_disabled("lapic", i)) {
                        cpu_info[i].cpu_disabled = 1;
                        continue;
                }
        }

        /*
         * Assign CPU IDs to local APIC IDs and disable any CPUs
         * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
         * so we only have to assign IDs for APs.
         */
        mp_ncpus = 1;
        for (i = 0; i <= MAX_APIC_ID; i++) {
                if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
                    cpu_info[i].cpu_disabled)
                        continue;

                if (mp_ncpus < MAXCPU) {
                        cpu_apic_ids[mp_ncpus] = i;
                        apic_cpuids[i] = mp_ncpus;
                        mp_ncpus++;
                } else
                        cpu_info[i].cpu_disabled = 1;
        }
        KASSERT(mp_maxid >= mp_ncpus - 1,
            ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
            mp_ncpus));
}

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
        int x, apic_id, cpu;
        struct pcpu *pc;

        mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

        /* set up temporary P==V mapping for AP boot */
        /* XXX this is a hack, we should boot the AP on its own stack/PTD */

        /* start each AP */
        for (cpu = 1; cpu < mp_ncpus; cpu++) {
                apic_id = cpu_apic_ids[cpu];

                bootAP = cpu;
                bootAPgdt = gdt + (512*cpu);

                /* Get per-cpu data */
                pc = &__pcpu[bootAP];
                pcpu_init(pc, bootAP, sizeof(struct pcpu));
                dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
                pc->pc_apic_id = cpu_apic_ids[bootAP];
                pc->pc_prvspace = pc;
                pc->pc_curthread = 0;

                gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
                gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

                PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
                bzero(bootAPgdt, PAGE_SIZE);
                for (x = 0; x < NGDT; x++)
                        ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
                PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet
                if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
                        apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
                        acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
                        if (acpiid != 0xff)
                                x86_acpiid_to_apicid[acpiid] = apicid;
#endif
                }
#endif

                /* attempt to start the Application Processor */
                if (!start_ap(cpu)) {
                        printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
                        /* better panic as the AP may be running loose */
                        printf("panic y/n? [y] ");
                        if (cngetc() != 'n')
                                panic("bye-bye");
                }

                all_cpus |= (1 << cpu);         /* record AP in CPU map */
        }

        /* build our map of 'other' CPUs */
        PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

        pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

        /* number of APs actually started */
        return mp_naps;
}

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
        const trap_info_t *t = trap_table;

        for (t = trap_table; t->address; t++) {
                trap_ctxt[t->vector].flags = t->flags;
                trap_ctxt[t->vector].cs = t->cs;
                trap_ctxt[t->vector].address = t->address;
        }
}

extern int nkpt;
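
/*
 * Starting an AP is again hypervisor-mediated: instead of the INIT/SIPI
 * sequence used on bare metal, cpu_initialize_context() builds a
 * vcpu_guest_context_t (GDT frame, ring-1 segment selectors, trap table,
 * boot stack, and a PDPT pinned via xen_pgdpt_pin()) and hands it to
 * HYPERVISOR_vcpu_op(VCPUOP_initialise, ...) followed by VCPUOP_up.
 */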

static void
cpu_initialize_context(unsigned int cpu)
{
        /*
         * vcpu_guest_context_t is too large to allocate on the stack.
         * Hence we allocate statically and protect it with a lock.
         */
        vm_page_t m[NPGPTD + 2];
        static vcpu_guest_context_t ctxt;
        vm_offset_t boot_stack;
        vm_offset_t newPTD;
        vm_paddr_t ma[NPGPTD];
        static int color;
        int i;

        /*
         * Pages [0-3]: PTD
         * Page  [4]:   boot stack
         * Page  [5]:   PDPT
         */
        for (i = 0; i < NPGPTD + 2; i++) {
                m[i] = vm_page_alloc(NULL, color++,
                    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
                    VM_ALLOC_ZERO);

                pmap_zero_page(m[i]);
        }
        boot_stack = kmem_alloc_nofault(kernel_map, 1);
        newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
        ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;

#ifdef PAE
        pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
        for (i = 0; i < NPGPTD; i++) {
                ((vm_paddr_t *)boot_stack)[i] =
                    ma[i] = xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
        }
#endif

        /*
         * Copy cpu0 IdlePTD to new IdlePTD - copying only
         * kernel mappings
         */
        pmap_qenter(newPTD, m, 4);

        memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
            (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
            nkpt*sizeof(vm_paddr_t));

        pmap_qremove(newPTD, 4);
        kmem_free(kernel_map, newPTD, 4);
        /*
         * map actual idle stack to boot_stack
         */
        pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

        xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
        vm_page_lock_queues();
        for (i = 0; i < 4; i++) {
                int pdir = (PTDPTDI + i) / NPDEPG;
                int curoffset = (PTDPTDI + i) % NPDEPG;

                xen_queue_pt_update((vm_paddr_t)
                    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
                    ma[i]);
        }
        PT_UPDATES_FLUSH();
        vm_page_unlock_queues();

        memset(&ctxt, 0, sizeof(ctxt));
        ctxt.flags = VGCF_IN_KERNEL;
        ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
        ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
        ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.user_regs.eip = (unsigned long)init_secondary;
        ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

        memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

        smp_trap_init(ctxt.trap_ctxt);

        ctxt.ldt_ents = 0;
        ctxt.gdt_frames[0] =
            (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
        ctxt.gdt_ents = 512;

#ifdef __i386__
        ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

        ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.kernel_sp = boot_stack + PAGE_SIZE;

        ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
        ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
        ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
        ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

        ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
        ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
        ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
        ctxt.kernel_sp = idle->thread.rsp0;

        ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
        ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
        ctxt.syscall_callback_eip = (unsigned long)system_call;

        ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

        ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

        printf("gdtpfn=%lx pdptpfn=%lx\n",
            ctxt.gdt_frames[0],
            ctxt.ctrlreg[3] >> PAGE_SHIFT);

        PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
        DELAY(3000);
        PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}
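
/*
 * Success is signalled asynchronously: init_secondary() increments mp_naps
 * on the new CPU, and start_ap() below simply polls for that increment for
 * up to five seconds.
 */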

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */
int cpus;
static int
start_ap(int apic_id)
{
        int ms;

        /* used as a watchpoint to signal AP startup */
        cpus = mp_naps;

        cpu_initialize_context(apic_id);

        /* Wait up to 5 seconds for it to start. */
        for (ms = 0; ms < 5000; ms++) {
                if (mp_naps > cpus)
                        return 1;       /* return SUCCESS */
                DELAY(1000);
        }
        return 0;               /* return FAILURE */
}

/*
 * Flush the TLB on all other CPU's
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
        u_int ncpu;
        struct _call_data data;

        ncpu = mp_ncpus - 1;    /* does not shootdown self */
        if (ncpu < 1)
                return;         /* no other cpus */
        if (!(read_eflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        KASSERT(call_data == NULL, ("call_data isn't null?!"));
        call_data = &data;
        call_data->func_id = vector;
        call_data->arg1 = addr1;
        call_data->arg2 = addr2;
        atomic_store_rel_int(&smp_tlb_wait, 0);
        ipi_all_but_self(vector);
        while (smp_tlb_wait < ncpu)
                ia32_pause();
        call_data = NULL;
        mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
        int ncpu, othercpus;
        struct _call_data data;

        othercpus = mp_ncpus - 1;
        if (mask == (u_int)-1) {
                ncpu = othercpus;
                if (ncpu < 1)
                        return;
        } else {
                mask &= ~PCPU_GET(cpumask);
                if (mask == 0)
                        return;
                ncpu = bitcount32(mask);
                if (ncpu > othercpus) {
                        /* XXX this should be a panic offence */
                        printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
                            ncpu, othercpus);
                        ncpu = othercpus;
                }
                /* XXX should be a panic, implied by mask == 0 above */
                if (ncpu < 1)
                        return;
        }
        if (!(read_eflags() & PSL_I))
                panic("%s: interrupts disabled", __func__);
        mtx_lock_spin(&smp_ipi_mtx);
        KASSERT(call_data == NULL, ("call_data isn't null?!"));
        call_data = &data;
        call_data->func_id = vector;
        call_data->arg1 = addr1;
        call_data->arg2 = addr2;
        atomic_store_rel_int(&smp_tlb_wait, 0);
        if (mask == (u_int)-1)
                ipi_all_but_self(vector);
        else
                ipi_selected(mask, vector);
        while (smp_tlb_wait < ncpu)
                ia32_pause();
        call_data = NULL;
        mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

        if (smp_started)
                smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
        }
}

void
smp_invlpg(vm_offset_t addr)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLPG, addr, 0);
        }
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

        if (smp_started) {
                smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
        }
}
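
/*
 * The smp_masked_*() variants take an explicit CPU set instead of
 * broadcasting; a mask of (u_int)-1 degenerates to the all-but-self case
 * in smp_targeted_tlb_shootdown() above.
 */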

void
smp_masked_invltlb(cpumask_t mask)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
        }
}

void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
        }
}

void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

        if (smp_started) {
                smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
        }
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
        int cpu;
        u_int bitmap = 0;
        u_int old_pending;
        u_int new_pending;

        if (IPI_IS_BITMAPED(ipi)) {
                bitmap = 1 << ipi;
                ipi = IPI_BITMAP_VECTOR;
        }

        /*
         * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
         * of help in order to understand what is the source.
         * Set the mask of receiving CPUs for this purpose.
         */
        if (ipi == IPI_STOP_HARD)
                atomic_set_int(&ipi_nmi_pending, cpus);

        CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
        while ((cpu = ffs(cpus)) != 0) {
                cpu--;
                cpus &= ~(1 << cpu);

                KASSERT(cpu_apic_ids[cpu] != -1,
                    ("IPI to non-existent CPU %d", cpu));

                if (bitmap) {
                        do {
                                old_pending = cpu_ipi_pending[cpu];
                                new_pending = old_pending | bitmap;
                        } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
                            old_pending, new_pending));

                        if (!old_pending)
                                ipi_pcpu(cpu, RESCHEDULE_VECTOR);
                        continue;
                } else {
                        KASSERT(call_data != NULL, ("call_data not set"));
                        ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
                }
        }
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

        /*
         * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
         * of help in order to understand what is the source.
         * Set the mask of receiving CPUs for this purpose.
         */
        if (ipi == IPI_STOP_HARD)
                atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));

        CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
        ipi_selected(PCPU_GET(other_cpus), ipi);
}
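
/*
 * ipi_nmi_handler() returns 0 only when this CPU's bit was set in
 * ipi_nmi_pending, i.e. the NMI really was an IPI_STOP_HARD; a nonzero
 * return is presumably taken by the trap code as "not ours", falling
 * through to ordinary NMI handling.
 */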

int
ipi_nmi_handler()
{
        cpumask_t cpumask;

        /*
         * As long as there is not a simple way to know about a NMI's
         * source, if the bitmask for the current CPU is present in
         * the global pending bitword an IPI_STOP_HARD has been issued
         * and should be handled.
         */
        cpumask = PCPU_GET(cpumask);
        if ((ipi_nmi_pending & cpumask) == 0)
                return (1);

        atomic_clear_int(&ipi_nmi_pending, cpumask);
        cpustop_handler();
        return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
        int cpu = PCPU_GET(cpuid);
        int cpumask = PCPU_GET(cpumask);

        savectx(&stoppcbs[cpu]);

        /* Indicate that we are stopped */
        atomic_set_int(&stopped_cpus, cpumask);

        /* Wait for restart */
        while (!(started_cpus & cpumask))
                ia32_pause();

        atomic_clear_int(&started_cpus, cpumask);
        atomic_clear_int(&stopped_cpus, cpumask);

        if (cpu == 0 && cpustop_restartfunc != NULL) {
                cpustop_restartfunc();
                cpustop_restartfunc = NULL;
        }
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

        if (mp_ncpus == 1)
                return;
        atomic_store_rel_int(&aps_ready, 1);
        while (smp_started == 0)
                ia32_pause();
}

SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);