mp_machdep.c revision 222813
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 222813 2011-06-07 08:46:13Z attilio $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <x86/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;
static volatile cpuset_t ipi_nmi_pending;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;
static int cpu_cores;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpuset_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}
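
/*
 * Example: on a hypothetical package with cpu_cores == 2 and
 * cpu_logical == 2, the smp_topo_2level() call above yields a two-level
 * tree: two CG_SHARE_NONE core groups, each holding a CG_SHARE_L1 pair
 * of hyperthreads flagged with CG_FLAG_HTT.
 */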

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = 0;
	CPU_ZERO(&logical_cpus_mask);
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}
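
/*
 * Note on the logical_cpus derivation in cpu_mp_start(): when CPUID_HTT
 * is set, CPUID leaf 1 reports the number of logical processors per
 * package in EBX bits 23:16, which is exactly what
 * (cpu_procinfo & CPUID_HTT_CORES) >> 16 extracts; e.g. a two-thread
 * package reports 2 there.
 */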

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
	vm_offset_t start = (vm_offset_t)a;
	vm_offset_t end = (vm_offset_t)b;

	while (start < end) {
		xen_invlpg(start);
		start += PAGE_SIZE;
	}
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * These start from "IPI offset" APIC_IPI_INTS
 */
static call_data_func_t *ipi_vectors[6] =
{
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
};
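
/*
 * Dispatch example: a func_id of IPI_INVLPG arriving in
 * smp_call_function_interrupt() below selects
 * ipi_vectors[IPI_INVLPG - APIC_IPI_INTS], i.e. iv_invlpg().  This
 * relies on the IPI_* numbers being allocated contiguously from
 * APIC_IPI_INTS in the same order as the table above.
 */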

/*
 * Reschedule call back.  Nothing to do, all the work is done
 * automatically when we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	return (FILTER_HANDLED);
}

struct _call_data {
	uint16_t func_id;
	uint16_t wait;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
};

static struct _call_data *call_data;

static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	/* We only handle function IPIs, not bitmap IPIs */
	if (call_data->func_id < APIC_IPI_INTS ||
	    call_data->func_id > IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}
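
/*
 * Illustrative initiator-side sketch of the call_data handshake (not
 * compiled; example_remote_call is a hypothetical name).  This is the
 * pattern smp_tlb_shootdown() implements below: publish a _call_data,
 * kick CALL_FUNCTION_VECTOR on the targets, then wait for each target's
 * smp_call_function_interrupt() to bump smp_tlb_wait.
 */
#if 0
static void
example_remote_call(u_int func_id, uintptr_t a1, uintptr_t a2)
{
	struct _call_data data;

	mtx_lock_spin(&smp_ipi_mtx);		/* one remote call at a time */
	call_data = &data;
	call_data->func_id = func_id;		/* selects an ipi_vectors[] entry */
	call_data->arg1 = a1;
	call_data->arg2 = a2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(func_id);		/* vector number == func_id */
	while (smp_tlb_wait < mp_ncpus - 1)	/* one ack per other CPU */
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}
#endif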

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	unsigned int irq;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
	    cpu,
	    resched_name[cpu],
	    smp_reschedule_interrupt,
	    INTR_TYPE_TTY, &irq);
	if (rc < 0)	/* don't use irq if the bind failed */
		goto fail;

	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
	    cpu, irq, RESCHEDULE_VECTOR);

	per_cpu(resched_irq, cpu) = irq;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
	    cpu,
	    callfunc_name[cpu],
	    smp_call_function_interrupt,
	    INTR_TYPE_TTY, &irq);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = irq;

	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
	    cpu, irq, CALL_FUNCTION_VECTOR);

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return (0);

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
	return (rc);
}

static void
xen_smp_intr_init_cpus(void *unused)
{
	int i;

	for (i = 0; i < mp_ncpus; i++)
		xen_smp_intr_init(i);
}

#define MTOPSIZE (1 << (14 + PAGE_SHIFT))

/*
 * AP CPU's call this to initialize themselves.
 */
void
init_secondary(void)
{
	cpuset_t tcpuset, tallcpus;
	vm_offset_t addr;
	int gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0);	/* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
	tcpuset = PCPU_GET(cpumask);

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		CPU_OR(&logical_cpus_mask, &tcpuset);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		CPU_OR(&hyperthreading_cpus_mask, &tcpuset);

	/* Build our map of 'other' CPUs. */
	tallcpus = all_cpus;
	CPU_NAND(&tallcpus, &tcpuset);
	PCPU_SET(other_cpus, tallcpus);
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	/* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
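
/*
 * Bring-up handshake, in brief: the mp_naps++ in init_secondary() is
 * what start_ap() polls to decide that a new AP is alive, while the
 * spin on aps_ready is only released by release_aps() at SI_SUB_SMP.
 * A wedged AP therefore shows up as start_ap() timing out after its
 * five-second poll loop.
 */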

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
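
/*
 * The resource_disabled("lapic", i) check in assign_cpu_ids() honors the
 * standard device-hint tunables, so e.g. setting hint.lapic.2.disabled="1"
 * in /boot/loader.conf keeps the CPU with APIC ID 2 out of the CPU map.
 */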

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	cpuset_t tallcpus;
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		bootAPgdt = gdt + (512 * cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet
		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	tallcpus = all_cpus;
	CPU_NAND(&tallcpus, PCPU_PTR(cpumask));
	PCPU_SET(other_cpus, tallcpus);

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return (mp_naps);
}
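
/*
 * GDT sizing note: each AP gets a 512-entry slice of gdt[]
 * (bootAPgdt = gdt + 512 * cpu).  At 8 bytes per descriptor that is
 * exactly one 4 KB page, which is why one PT_SET_MA() call suffices to
 * remap the slice read-only for Xen and why cpu_initialize_context()
 * advertises gdt_ents = 512 below.
 */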

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t = trap_table;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
static void
cpu_initialize_context(unsigned int cpu)
{
	/* vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock */
	vm_page_t m[NPGPTD + 2];	/* must cover the NPGPTD + 2 pages
					 * allocated below */
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Pages [0-3]:	PTD
	 * Page  [4]:	boot stack
	 * Page  [5]:	PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = VM_PAGE_TO_MACH(m[0]) | PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		    ma[i] = VM_PAGE_TO_MACH(m[i]) | PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
	 * kernel mappings
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI * sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI * sizeof(vm_paddr_t),
	    nkpt * sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1]));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset * sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000;	/* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] =
	    (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]);
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return (1);	/* return SUCCESS */
		DELAY(1000);
	}
	return (0);		/* return FAILURE */
}

/*
 * send an IPI to a specific CPU.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (!old_pending)
			ipi_pcpu(cpu, RESCHEDULE_VECTOR);
	} else {
		KASSERT(call_data != NULL, ("call_data not set"));
		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
	}
}
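
/*
 * Coalescing example: ipi_send_cpu(cpu, IPI_PREEMPT) ORs
 * (1 << IPI_PREEMPT) into cpu_ipi_pending[cpu] and raises
 * RESCHEDULE_VECTOR only when no bits were already pending, so
 * back-to-back preempt/AST requests collapse into one interrupt that
 * smp_reschedule_interrupt() drains in a single pass.
 */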

/*
 * Flush the TLB on all other CPU's
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int cpu, ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (CPU_ISFULLSET(&mask)) {
		if (othercpus < 1)
			return;
	} else {
		critical_enter();
		CPU_NAND(&mask, PCPU_PTR(cpumask));
		critical_exit();
		if (CPU_EMPTY(&mask))
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (CPU_ISFULLSET(&mask)) {
		ncpu = othercpus;
		ipi_all_but_self(vector);
	} else {
		ncpu = 0;
		while ((cpu = cpusetobj_ffs(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu,
			    vector);
			ipi_send_cpu(cpu, vector);
			ncpu++;
		}
	}
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(cpuset_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}
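
/*
 * Usage sketch (illustrative): after changing a range of mappings,
 * pmap code would push the invalidation to the CPUs that may have the
 * stale entries cached, e.g.
 *
 *	smp_masked_invlpg_range(other_cpus, sva, eva);
 *
 * with other_cpus/sva/eva being hypothetical names for the target set
 * and the start/end addresses.  All of the wrappers above are no-ops
 * until smp_started is set.
 */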

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);

	while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	sched_pin();
	other_cpus = PCPU_GET(other_cpus);
	sched_unpin();
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(other_cpus, ipi);
}

int
ipi_nmi_handler(void)
{
	cpuset_t cpumask;

	/*
	 * As long as there is not a simple way to know about a NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	sched_pin();
	cpumask = PCPU_GET(cpumask);
	sched_unpin();
	if (!CPU_OVERLAP(&ipi_nmi_pending, &cpumask))
		return (1);

	CPU_NAND_ATOMIC(&ipi_nmi_pending, &cpumask);
	cpustop_handler();
	return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	cpuset_t cpumask;
	int cpu;

	sched_pin();
	cpumask = PCPU_GET(cpumask);
	cpu = PCPU_GET(cpuid);
	sched_unpin();

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_OR_ATOMIC(&stopped_cpus, &cpumask);

	/* Wait for restart */
	while (!CPU_OVERLAP(&started_cpus, &cpumask))
		ia32_pause();

	CPU_NAND_ATOMIC(&started_cpus, &cpumask);
	CPU_NAND_ATOMIC(&stopped_cpus, &cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}

SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);