mp_machdep.c revision 184112

/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 184112 2008-10-21 06:39:40Z kmacy $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <machine/xen/evtchn.h>
#include <machine/xen/xen_intr.h>
#include <machine/xen/hypervisor.h>
#include <xen/interface/vcpu.h>

#define stop_cpus_with_nmi	0

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;

/* Used to hold the APs until we are ready to release them. */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int		start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}

	/*
	 * Neither multi-core nor hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * HTT only, no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Multi-core only, no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}
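
/*
 * Added commentary (not in the original file): for a hypothetical box with
 * two cores and two hyper-threads per core, the smp_topo_2level() call
 * above describes a tree of one CG_SHARE_NONE root with two CG_SHARE_L1
 * children, each holding two hyper-threads:
 *
 *	root (CG_SHARE_NONE)
 *	|-- core0 (CG_SHARE_L1, CG_FLAG_HTT): cpu0 cpu1
 *	`-- core1 (CG_SHARE_L1, CG_FLAG_HTT): cpu2 cpu3
 *
 * The scheduler consumes this tree when deciding how to balance load
 * between CPUs that do or do not share caches.
 */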

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Set up
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;

	assign_cpu_ids();

	/* Start each Application Processor. */
	start_all_aps();

	/* Set up the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}
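
/*
 * Added commentary (not in the original file): CPUID_HTT_CORES is the
 * mask 0x00ff0000, i.e. bits 23:16 of CPUID leaf 1's EBX, which report
 * the number of logical processors per physical package; the >> 16 in
 * cpu_mp_start() above extracts that count.  On a CPU without HTT the
 * CPUID_HTT feature bit is clear and logical_cpus stays 0.
 */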

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
}

static void
iv_bitmap_vector(uintptr_t a, uintptr_t b)
{
}

static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR + 1] = {
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
	iv_bitmap_vector
};

/*
 * Reschedule call back.  Nothing to do, all the work is done automatically
 * when we return from the interrupt.
 */
static void
smp_reschedule_interrupt(void *unused)
{
}

struct _call_data {
	call_data_func_t *func;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
	int wait;
};

static struct _call_data *call_data;

static void
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func = call_data->func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;

	/*
	 * Notify the initiating CPU that I've grabbed the data and am
	 * about to execute the function.
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the call_data structure may be out of scope
	 * unless wait == 1.
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(&call_data->finished);
	}
}
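
/*
 * Added sketch (not part of the original file): nothing in this revision
 * populates call_data for a generic cross-call, so the handler above is
 * only driven by smp_tlb_shootdown() further below.  A hypothetical
 * initiator would pair with the started/finished counters roughly as
 * follows; the function name smp_call_function() is invented here, and
 * the atomic_set()/atomic_read() helpers are assumed to come from the
 * Linux-style atomics in xen-os.h alongside atomic_inc().
 */
#if 0
static void
smp_call_function(call_data_func_t *func, uintptr_t arg1, uintptr_t arg2,
    int wait)
{
	static struct _call_data data;
	int othercpus = mp_ncpus - 1;

	mtx_lock_spin(&smp_ipi_mtx);
	data.func = func;
	data.arg1 = arg1;
	data.arg2 = arg2;
	atomic_set(&data.started, 0);
	atomic_set(&data.finished, 0);
	data.wait = wait;
	call_data = &data;
	mb();			/* publish call_data before raising the IPI */
	ipi_all_but_self(CALL_FUNCTION_VECTOR);
	/* Wait until every other CPU has picked up the arguments... */
	while (atomic_read(&data.started) != othercpus)
		ia32_pause();
	/* ...and, when wait is set, until they have all run func. */
	if (wait)
		while (atomic_read(&data.finished) != othercpus)
			ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}
#endif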

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
	    cpu,
	    resched_name[cpu],
	    smp_reschedule_interrupt,
	    INTR_FAST);
	if (rc < 0)
		goto fail;
	per_cpu(resched_irq, cpu) = rc;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
	    cpu,
	    callfunc_name[cpu],
	    smp_call_function_interrupt,
	    INTR_FAST);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = rc;

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return (0);

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
	return (rc);
}

#define MTOPSIZE (1 << (14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * Signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting. */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* Set up FPU state on the AP. */
	npxinit(__INITIAL_NPXCW__);
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* Enable IPIs, tlb shootdown, freezes, etc. */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	/* historic */
	}

	xen_smp_intr_init(bootAP);
	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));

	/* Enter the scheduler. */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
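
/*
 * Added commentary (not in the original file): the BSP/AP handshake in
 * init_secondary() is three-step.  The AP bumps mp_naps so that start_ap()
 * on the BSP can see it is alive; it then spins on aps_ready, which
 * release_aps() sets at SI_SUB_SMP time (see the SYSINIT at the bottom of
 * this file); finally every CPU spins on smp_started, which the last AP
 * to arrive sets once smp_cpus == mp_ncpus.
 */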

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * Start each AP in our list.
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		bootAPgdt = gdt + (512 * cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int)&pc->pc_common_tss;

		/*
		 * Map the AP's GDT page writable so it can be zeroed and
		 * filled in, then drop the mapping back to read-only:
		 * Xen will not let a guest use a GDT living in pages the
		 * guest can still write.
		 */
		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet
		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		all_cpus |= (1 << cpu);	/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return (mp_naps);
}

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
static void
cpu_initialize_context(unsigned int cpu)
{
	/*
	 * vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate it statically and protect it with a lock.
	 */
	vm_page_t m[NPGPTD + 2];
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Pages [0..NPGPTD-1]:	PTD
	 * Page  [NPGPTD]:	boot stack
	 * Page  [NPGPTD+1]:	PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0])) | PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		    ma[i] = xpmap_ptom(VM_PAGE_TO_PHYS(m[i])) | PG_V;
	}
#endif

	/*
	 * Copy cpu0's IdlePTD to the new IdlePTD, copying only
	 * the kernel mappings.
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI * sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI * sizeof(vm_paddr_t),
	    nkpt * sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * Map the actual idle stack to boot_stack.
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset * sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000;	/* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] =
	    (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * This function starts the AP (application processor) identified by the
 * APIC ID 'physicalCpu'.  It does quite a "song and dance" to accomplish
 * this.  This is necessary because of the nuances of the different
 * hardware we might encounter.  It isn't pretty, but it seems to work.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return (1);	/* return SUCCESS */
		DELAY(1000);
	}
	return (0);		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	call_data->func = ipi_vectors[vector];
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(u_int mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(u_int mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
		sched_preempt(curthread);
	}
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
			    old_pending, new_pending));

			if (old_pending)
				continue;
		}

		ipi_pcpu(cpu, ipi);
	}
}
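
/*
 * Added commentary (not in the original file): for bitmapped IPIs such as
 * IPI_PREEMPT, ipi_selected() ORs the request into cpu_ipi_pending[cpu]
 * and only raises IPI_BITMAP_VECTOR when the word was previously empty;
 * ipi_bitmap_handler() above then drains all coalesced requests with a
 * single atomic_readandclear_int().  A caller wanting to preempt CPU 2,
 * for example, would simply do:
 *
 *	ipi_selected(1 << 2, IPI_PREEMPT);
 */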

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(((int)-1 & ~(1 << curcpu)), ipi);
}

#ifdef STOP_NMI
/*
 * Send an NMI IPI to selected CPUs.
 */

#define BEFORE_SPIN	1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
	    | APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return (1);

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return (0);
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);