/* mp_machdep.c revision 264118 */
1/*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2008, by Kip Macy 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/10/sys/i386/xen/mp_machdep.c 264118 2014-04-04 14:54:54Z royger $"); 29 30#include "opt_apic.h" 31#include "opt_cpu.h" 32#include "opt_kstack_pages.h" 33#include "opt_mp_watchdog.h" 34#include "opt_pmap.h" 35#include "opt_sched.h" 36#include "opt_smp.h" 37 38#if !defined(lint) 39#if !defined(SMP) 40#error How did you get here? 41#endif 42 43#ifndef DEV_APIC 44#error The apic device is required for SMP, add "device apic" to your config file. 
45#endif 46#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) 47#error SMP not supported with CPU_DISABLE_CMPXCHG 48#endif 49#endif /* not lint */ 50 51#include <sys/param.h> 52#include <sys/systm.h> 53#include <sys/bus.h> 54#include <sys/cons.h> /* cngetc() */ 55#include <sys/cpuset.h> 56#ifdef GPROF 57#include <sys/gmon.h> 58#endif 59#include <sys/kernel.h> 60#include <sys/ktr.h> 61#include <sys/lock.h> 62#include <sys/malloc.h> 63#include <sys/memrange.h> 64#include <sys/mutex.h> 65#include <sys/pcpu.h> 66#include <sys/proc.h> 67#include <sys/rwlock.h> 68#include <sys/sched.h> 69#include <sys/smp.h> 70#include <sys/sysctl.h> 71 72#include <vm/vm.h> 73#include <vm/vm_param.h> 74#include <vm/pmap.h> 75#include <vm/vm_kern.h> 76#include <vm/vm_extern.h> 77#include <vm/vm_page.h> 78 79#include <x86/apicreg.h> 80#include <machine/md_var.h> 81#include <machine/mp_watchdog.h> 82#include <machine/pcb.h> 83#include <machine/psl.h> 84#include <machine/smp.h> 85#include <machine/specialreg.h> 86#include <machine/pcpu.h> 87 88#include <xen/xen-os.h> 89#include <xen/evtchn.h> 90#include <xen/xen_intr.h> 91#include <xen/hypervisor.h> 92#include <xen/interface/vcpu.h> 93 94/*---------------------------- Extern Declarations ---------------------------*/ 95extern struct pcpu __pcpu[]; 96 97extern void Xhypervisor_callback(void); 98extern void failsafe_callback(void); 99extern void pmap_lazyfix_action(void); 100 101/*--------------------------- Forward Declarations ---------------------------*/ 102static driver_filter_t smp_reschedule_interrupt; 103static driver_filter_t smp_call_function_interrupt; 104static void assign_cpu_ids(void); 105static void set_interrupt_apic_ids(void); 106static int start_all_aps(void); 107static int start_ap(int apic_id); 108static void release_aps(void *dummy); 109 110/*---------------------------------- Macros ----------------------------------*/ 111#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) 112 113/*-------------------------------- 
Local Types -------------------------------*/ 114typedef void call_data_func_t(uintptr_t , uintptr_t); 115 116struct cpu_info { 117 int cpu_present:1; 118 int cpu_bsp:1; 119 int cpu_disabled:1; 120}; 121 122struct xen_ipi_handler 123{ 124 driver_filter_t *filter; 125 const char *description; 126}; 127 128enum { 129 RESCHEDULE_VECTOR, 130 CALL_FUNCTION_VECTOR, 131}; 132 133/*-------------------------------- Global Data -------------------------------*/ 134static u_int hyperthreading_cpus; 135static cpuset_t hyperthreading_cpus_mask; 136 137int mp_naps; /* # of Applications processors */ 138int boot_cpu_id = -1; /* designated BSP */ 139 140static int bootAP; 141static union descriptor *bootAPgdt; 142 143/* Free these after use */ 144void *bootstacks[MAXCPU]; 145 146struct pcb stoppcbs[MAXCPU]; 147 148/* Variables needed for SMP tlb shootdown. */ 149vm_offset_t smp_tlb_addr1; 150vm_offset_t smp_tlb_addr2; 151volatile int smp_tlb_wait; 152 153static u_int logical_cpus; 154static volatile cpuset_t ipi_nmi_pending; 155 156/* used to hold the AP's until we are ready to release them */ 157static struct mtx ap_boot_mtx; 158 159/* Set to 1 once we're ready to let the APs out of the pen. */ 160static volatile int aps_ready = 0; 161 162/* 163 * Store data from cpu_add() until later in the boot when we actually setup 164 * the APs. 
165 */ 166static struct cpu_info cpu_info[MAX_APIC_ID + 1]; 167int cpu_apic_ids[MAXCPU]; 168int apic_cpuids[MAX_APIC_ID + 1]; 169 170/* Holds pending bitmap based IPIs per CPU */ 171static volatile u_int cpu_ipi_pending[MAXCPU]; 172 173static int cpu_logical; 174static int cpu_cores; 175 176static const struct xen_ipi_handler xen_ipis[] = 177{ 178 [RESCHEDULE_VECTOR] = { smp_reschedule_interrupt, "resched" }, 179 [CALL_FUNCTION_VECTOR] = { smp_call_function_interrupt,"callfunc" } 180}; 181 182/*------------------------------- Per-CPU Data -------------------------------*/ 183DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); 184DPCPU_DEFINE(struct vcpu_info *, vcpu_info); 185 186/*------------------------------ Implementation ------------------------------*/ 187struct cpu_group * 188cpu_topo(void) 189{ 190 if (cpu_cores == 0) 191 cpu_cores = 1; 192 if (cpu_logical == 0) 193 cpu_logical = 1; 194 if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { 195 printf("WARNING: Non-uniform processors.\n"); 196 printf("WARNING: Using suboptimal topology.\n"); 197 return (smp_topo_none()); 198 } 199 /* 200 * No multi-core or hyper-threaded. 201 */ 202 if (cpu_logical * cpu_cores == 1) 203 return (smp_topo_none()); 204 /* 205 * Only HTT no multi-core. 206 */ 207 if (cpu_logical > 1 && cpu_cores == 1) 208 return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 209 /* 210 * Only multi-core no HTT. 211 */ 212 if (cpu_cores > 1 && cpu_logical == 1) 213 return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); 214 /* 215 * Both HTT and multi-core. 216 */ 217 return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, 218 CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); 219} 220 221/* 222 * Calculate usable address in base memory for AP trampoline code. 
223 */ 224u_int 225mp_bootaddress(u_int basemem) 226{ 227 228 return (basemem); 229} 230 231void 232cpu_add(u_int apic_id, char boot_cpu) 233{ 234 235 if (apic_id > MAX_APIC_ID) { 236 panic("SMP: APIC ID %d too high", apic_id); 237 return; 238 } 239 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", 240 apic_id)); 241 cpu_info[apic_id].cpu_present = 1; 242 if (boot_cpu) { 243 KASSERT(boot_cpu_id == -1, 244 ("CPU %d claims to be BSP, but CPU %d already is", apic_id, 245 boot_cpu_id)); 246 boot_cpu_id = apic_id; 247 cpu_info[apic_id].cpu_bsp = 1; 248 } 249 if (mp_ncpus < MAXCPU) 250 mp_ncpus++; 251 if (bootverbose) 252 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : 253 "AP"); 254} 255 256void 257cpu_mp_setmaxid(void) 258{ 259 260 mp_maxid = MAXCPU - 1; 261} 262 263int 264cpu_mp_probe(void) 265{ 266 267 /* 268 * Always record BSP in CPU map so that the mbuf init code works 269 * correctly. 270 */ 271 CPU_SETOF(0, &all_cpus); 272 if (mp_ncpus == 0) { 273 /* 274 * No CPUs were found, so this must be a UP system. Setup 275 * the variables to represent a system with a single CPU 276 * with an id of 0. 277 */ 278 mp_ncpus = 1; 279 return (0); 280 } 281 282 /* At least one CPU was found. */ 283 if (mp_ncpus == 1) { 284 /* 285 * One CPU was found, so this must be a UP system with 286 * an I/O APIC. 287 */ 288 return (0); 289 } 290 291 /* At least two CPUs were found. */ 292 return (1); 293} 294 295/* 296 * Initialize the IPI handlers and start up the AP's. 297 */ 298void 299cpu_mp_start(void) 300{ 301 int i; 302 303 /* Initialize the logical ID to APIC ID table. */ 304 for (i = 0; i < MAXCPU; i++) { 305 cpu_apic_ids[i] = -1; 306 cpu_ipi_pending[i] = 0; 307 } 308 309 /* Set boot_cpu_id if needed. 
*/ 310 if (boot_cpu_id == -1) { 311 boot_cpu_id = PCPU_GET(apic_id); 312 cpu_info[boot_cpu_id].cpu_bsp = 1; 313 } else 314 KASSERT(boot_cpu_id == PCPU_GET(apic_id), 315 ("BSP's APIC ID doesn't match boot_cpu_id")); 316 cpu_apic_ids[0] = boot_cpu_id; 317 apic_cpuids[boot_cpu_id] = 0; 318 319 assign_cpu_ids(); 320 321 /* Start each Application Processor */ 322 start_all_aps(); 323 324 /* Setup the initial logical CPUs info. */ 325 logical_cpus = 0; 326 CPU_ZERO(&logical_cpus_mask); 327 if (cpu_feature & CPUID_HTT) 328 logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; 329 330 set_interrupt_apic_ids(); 331} 332 333 334static void 335iv_rendezvous(uintptr_t a, uintptr_t b) 336{ 337 smp_rendezvous_action(); 338} 339 340static void 341iv_invltlb(uintptr_t a, uintptr_t b) 342{ 343 xen_tlb_flush(); 344} 345 346static void 347iv_invlpg(uintptr_t a, uintptr_t b) 348{ 349 xen_invlpg(a); 350} 351 352static void 353iv_invlrng(uintptr_t a, uintptr_t b) 354{ 355 vm_offset_t start = (vm_offset_t)a; 356 vm_offset_t end = (vm_offset_t)b; 357 358 while (start < end) { 359 xen_invlpg(start); 360 start += PAGE_SIZE; 361 } 362} 363 364 365static void 366iv_invlcache(uintptr_t a, uintptr_t b) 367{ 368 369 wbinvd(); 370 atomic_add_int(&smp_tlb_wait, 1); 371} 372 373static void 374iv_lazypmap(uintptr_t a, uintptr_t b) 375{ 376 pmap_lazyfix_action(); 377 atomic_add_int(&smp_tlb_wait, 1); 378} 379 380/* 381 * These start from "IPI offset" APIC_IPI_INTS 382 */ 383static call_data_func_t *ipi_vectors[6] = 384{ 385 iv_rendezvous, 386 iv_invltlb, 387 iv_invlpg, 388 iv_invlrng, 389 iv_invlcache, 390 iv_lazypmap, 391}; 392 393/* 394 * Reschedule call back. Nothing to do, 395 * all the work is done automatically when 396 * we return from the interrupt. 
397 */ 398static int 399smp_reschedule_interrupt(void *unused) 400{ 401 int cpu = PCPU_GET(cpuid); 402 u_int ipi_bitmap; 403 404 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); 405 406 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 407#ifdef COUNT_IPIS 408 (*ipi_preempt_counts[cpu])++; 409#endif 410 sched_preempt(curthread); 411 } 412 413 if (ipi_bitmap & (1 << IPI_AST)) { 414#ifdef COUNT_IPIS 415 (*ipi_ast_counts[cpu])++; 416#endif 417 /* Nothing to do for AST */ 418 } 419 return (FILTER_HANDLED); 420} 421 422struct _call_data { 423 uint16_t func_id; 424 uint16_t wait; 425 uintptr_t arg1; 426 uintptr_t arg2; 427 atomic_t started; 428 atomic_t finished; 429}; 430 431static struct _call_data *call_data; 432 433static int 434smp_call_function_interrupt(void *unused) 435{ 436 call_data_func_t *func; 437 uintptr_t arg1 = call_data->arg1; 438 uintptr_t arg2 = call_data->arg2; 439 int wait = call_data->wait; 440 atomic_t *started = &call_data->started; 441 atomic_t *finished = &call_data->finished; 442 443 /* We only handle function IPIs, not bitmap IPIs */ 444 if (call_data->func_id < APIC_IPI_INTS || 445 call_data->func_id > IPI_BITMAP_VECTOR) 446 panic("invalid function id %u", call_data->func_id); 447 448 func = ipi_vectors[IPI_TO_IDX(call_data->func_id)]; 449 /* 450 * Notify initiating CPU that I've grabbed the data and am 451 * about to execute the function 452 */ 453 mb(); 454 atomic_inc(started); 455 /* 456 * At this point the info structure may be out of scope unless wait==1 457 */ 458 (*func)(arg1, arg2); 459 460 if (wait) { 461 mb(); 462 atomic_inc(finished); 463 } 464 atomic_add_int(&smp_tlb_wait, 1); 465 return (FILTER_HANDLED); 466} 467 468/* 469 * Print various information about the SMP system hardware and setup. 
470 */ 471void 472cpu_mp_announce(void) 473{ 474 int i, x; 475 476 /* List CPUs */ 477 printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); 478 for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { 479 if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) 480 continue; 481 if (cpu_info[x].cpu_disabled) 482 printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); 483 else { 484 KASSERT(i < mp_ncpus, 485 ("mp_ncpus and actual cpus are out of whack")); 486 printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); 487 } 488 } 489} 490 491static int 492xen_smp_cpu_init(unsigned int cpu) 493{ 494 xen_intr_handle_t *ipi_handle; 495 const struct xen_ipi_handler *ipi; 496 int idx, rc; 497 498 ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); 499 for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { 500 501 /* 502 * The PCPU variable pc_device is not initialized on i386 PV, 503 * so we have to use the root_bus device in order to setup 504 * the IPIs. 505 */ 506 rc = xen_intr_alloc_and_bind_ipi(root_bus, cpu, 507 ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); 508 if (rc != 0) { 509 printf("Unable to allocate a XEN IPI port. " 510 "Error %d\n", rc); 511 break; 512 } 513 xen_intr_describe(ipi_handle[idx], "%s", ipi->description); 514 } 515 516 for (;idx < nitems(xen_ipis); idx++) 517 ipi_handle[idx] = NULL; 518 519 if (rc == 0) 520 return (0); 521 522 /* Either all are successfully mapped, or none at all. 
*/ 523 for (idx = 0; idx < nitems(xen_ipis); idx++) { 524 if (ipi_handle[idx] == NULL) 525 continue; 526 527 xen_intr_unbind(ipi_handle[idx]); 528 ipi_handle[idx] = NULL; 529 } 530 531 return (rc); 532} 533 534static void 535xen_smp_intr_init_cpus(void *unused) 536{ 537 int i; 538 539 for (i = 0; i < mp_ncpus; i++) 540 xen_smp_cpu_init(i); 541} 542 543static void 544xen_smp_intr_setup_cpus(void *unused) 545{ 546 int i; 547 548 for (i = 0; i < mp_ncpus; i++) 549 DPCPU_ID_SET(i, vcpu_info, 550 &HYPERVISOR_shared_info->vcpu_info[i]); 551} 552 553#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 554 555/* 556 * AP CPU's call this to initialize themselves. 557 */ 558void 559init_secondary(void) 560{ 561 vm_offset_t addr; 562 u_int cpuid; 563 int gsel_tss; 564 565 566 /* bootAP is set in start_ap() to our ID. */ 567 PCPU_SET(currentldt, _default_ldt); 568 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 569#if 0 570 gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; 571#endif 572 PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ 573 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 574 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 575#if 0 576 PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); 577 578 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 579#endif 580 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 581 582 /* 583 * Set to a known state: 584 * Set by mpboot.s: CR0_PG, CR0_PE 585 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM 586 */ 587 /* 588 * signal our startup to the BSP. 589 */ 590 mp_naps++; 591 592 /* Spin until the BSP releases the AP's. 
*/ 593 while (!aps_ready) 594 ia32_pause(); 595 596 /* BSP may have changed PTD while we were waiting */ 597 invltlb(); 598 for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) 599 invlpg(addr); 600 601 /* set up FPU state on the AP */ 602 npxinit(); 603#if 0 604 605 /* set up SSE registers */ 606 enable_sse(); 607#endif 608#if 0 && defined(PAE) 609 /* Enable the PTE no-execute bit. */ 610 if ((amd_feature & AMDID_NX) != 0) { 611 uint64_t msr; 612 613 msr = rdmsr(MSR_EFER) | EFER_NXE; 614 wrmsr(MSR_EFER, msr); 615 } 616#endif 617#if 0 618 /* A quick check from sanity claus */ 619 if (PCPU_GET(apic_id) != lapic_id()) { 620 printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); 621 printf("SMP: actual apic_id = %d\n", lapic_id()); 622 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 623 panic("cpuid mismatch! boom!!"); 624 } 625#endif 626 627 /* Initialize curthread. */ 628 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 629 PCPU_SET(curthread, PCPU_GET(idlethread)); 630 631 mtx_lock_spin(&ap_boot_mtx); 632#if 0 633 634 /* Init local apic for irq's */ 635 lapic_setup(1); 636#endif 637 smp_cpus++; 638 639 cpuid = PCPU_GET(cpuid); 640 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 641 printf("SMP: AP CPU #%d Launched!\n", cpuid); 642 643 /* Determine if we are a logical CPU. */ 644 if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) 645 CPU_SET(cpuid, &logical_cpus_mask); 646 647 /* Determine if we are a hyperthread. 
*/ 648 if (hyperthreading_cpus > 1 && 649 PCPU_GET(apic_id) % hyperthreading_cpus != 0) 650 CPU_SET(cpuid, &hyperthreading_cpus_mask); 651#if 0 652 if (bootverbose) 653 lapic_dump("AP"); 654#endif 655 if (smp_cpus == mp_ncpus) { 656 /* enable IPI's, tlb shootdown, freezes etc */ 657 atomic_store_rel_int(&smp_started, 1); 658 smp_active = 1; /* historic */ 659 } 660 661 mtx_unlock_spin(&ap_boot_mtx); 662 663 /* wait until all the AP's are up */ 664 while (smp_started == 0) 665 ia32_pause(); 666 667 PCPU_SET(curthread, PCPU_GET(idlethread)); 668 669 /* Start per-CPU event timers. */ 670 cpu_initclocks_ap(); 671 672 /* enter the scheduler */ 673 sched_throw(NULL); 674 675 panic("scheduler returned us to %s", __func__); 676 /* NOTREACHED */ 677} 678 679/******************************************************************* 680 * local functions and data 681 */ 682 683/* 684 * We tell the I/O APIC code about all the CPUs we want to receive 685 * interrupts. If we don't want certain CPUs to receive IRQs we 686 * can simply not tell the I/O APIC code about them in this function. 687 * We also do not tell it about the BSP since it tells itself about 688 * the BSP internally to work with UP kernels and on UP machines. 689 */ 690static void 691set_interrupt_apic_ids(void) 692{ 693 u_int i, apic_id; 694 695 for (i = 0; i < MAXCPU; i++) { 696 apic_id = cpu_apic_ids[i]; 697 if (apic_id == -1) 698 continue; 699 if (cpu_info[apic_id].cpu_bsp) 700 continue; 701 if (cpu_info[apic_id].cpu_disabled) 702 continue; 703 704 /* Don't let hyperthreads service interrupts. */ 705 if (hyperthreading_cpus > 1 && 706 apic_id % hyperthreading_cpus != 0) 707 continue; 708 709 intr_add_cpu(i); 710 } 711} 712 713/* 714 * Assign logical CPU IDs to local APICs. 715 */ 716static void 717assign_cpu_ids(void) 718{ 719 u_int i; 720 721 /* Check for explicitly disabled CPUs. 
*/ 722 for (i = 0; i <= MAX_APIC_ID; i++) { 723 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) 724 continue; 725 726 /* Don't use this CPU if it has been disabled by a tunable. */ 727 if (resource_disabled("lapic", i)) { 728 cpu_info[i].cpu_disabled = 1; 729 continue; 730 } 731 } 732 733 /* 734 * Assign CPU IDs to local APIC IDs and disable any CPUs 735 * beyond MAXCPU. CPU 0 has already been assigned to the BSP, 736 * so we only have to assign IDs for APs. 737 */ 738 mp_ncpus = 1; 739 for (i = 0; i <= MAX_APIC_ID; i++) { 740 if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || 741 cpu_info[i].cpu_disabled) 742 continue; 743 744 if (mp_ncpus < MAXCPU) { 745 cpu_apic_ids[mp_ncpus] = i; 746 apic_cpuids[i] = mp_ncpus; 747 mp_ncpus++; 748 } else 749 cpu_info[i].cpu_disabled = 1; 750 } 751 KASSERT(mp_maxid >= mp_ncpus - 1, 752 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 753 mp_ncpus)); 754} 755 756/* 757 * start each AP in our list 758 */ 759/* Lowest 1MB is already mapped: don't touch*/ 760#define TMPMAP_START 1 761int 762start_all_aps(void) 763{ 764 int x,apic_id, cpu; 765 struct pcpu *pc; 766 767 mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); 768 769 /* set up temporary P==V mapping for AP boot */ 770 /* XXX this is a hack, we should boot the AP on its own stack/PTD */ 771 772 /* start each AP */ 773 for (cpu = 1; cpu < mp_ncpus; cpu++) { 774 apic_id = cpu_apic_ids[cpu]; 775 776 777 bootAP = cpu; 778 bootAPgdt = gdt + (512*cpu); 779 780 /* Get per-cpu data */ 781 pc = &__pcpu[bootAP]; 782 pcpu_init(pc, bootAP, sizeof(struct pcpu)); 783 dpcpu_init((void *)kmem_malloc(kernel_arena, DPCPU_SIZE, 784 M_WAITOK | M_ZERO), bootAP); 785 pc->pc_apic_id = cpu_apic_ids[bootAP]; 786 pc->pc_vcpu_id = cpu_apic_ids[bootAP]; 787 pc->pc_prvspace = pc; 788 pc->pc_curthread = 0; 789 790 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 791 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 792 793 PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW); 794 
bzero(bootAPgdt, PAGE_SIZE); 795 for (x = 0; x < NGDT; x++) 796 ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); 797 PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); 798#ifdef notyet 799 800 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { 801 apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); 802 acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); 803#ifdef CONFIG_ACPI 804 if (acpiid != 0xff) 805 x86_acpiid_to_apicid[acpiid] = apicid; 806#endif 807 } 808#endif 809 810 /* attempt to start the Application Processor */ 811 if (!start_ap(cpu)) { 812 printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); 813 /* better panic as the AP may be running loose */ 814 printf("panic y/n? [y] "); 815 if (cngetc() != 'n') 816 panic("bye-bye"); 817 } 818 819 CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ 820 } 821 822 823 pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); 824 825 /* number of APs actually started */ 826 return (mp_naps); 827} 828 829extern uint8_t *pcpu_boot_stack; 830extern trap_info_t trap_table[]; 831 832static void 833smp_trap_init(trap_info_t *trap_ctxt) 834{ 835 const trap_info_t *t = trap_table; 836 837 for (t = trap_table; t->address; t++) { 838 trap_ctxt[t->vector].flags = t->flags; 839 trap_ctxt[t->vector].cs = t->cs; 840 trap_ctxt[t->vector].address = t->address; 841 } 842} 843 844extern struct rwlock pvh_global_lock; 845extern int nkpt; 846static void 847cpu_initialize_context(unsigned int cpu) 848{ 849 /* vcpu_guest_context_t is too large to allocate on the stack. 
850 * Hence we allocate statically and protect it with a lock */ 851 vm_page_t m[NPGPTD + 2]; 852 static vcpu_guest_context_t ctxt; 853 vm_offset_t boot_stack; 854 vm_offset_t newPTD; 855 vm_paddr_t ma[NPGPTD]; 856 int i; 857 858 /* 859 * Page 0,[0-3] PTD 860 * Page 1, [4] boot stack 861 * Page [5] PDPT 862 * 863 */ 864 for (i = 0; i < NPGPTD + 2; i++) { 865 m[i] = vm_page_alloc(NULL, 0, 866 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 867 VM_ALLOC_ZERO); 868 869 pmap_zero_page(m[i]); 870 871 } 872 boot_stack = kva_alloc(PAGE_SIZE); 873 newPTD = kva_alloc(NPGPTD * PAGE_SIZE); 874 ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V; 875 876#ifdef PAE 877 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); 878 for (i = 0; i < NPGPTD; i++) { 879 ((vm_paddr_t *)boot_stack)[i] = 880 ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V; 881 } 882#endif 883 884 /* 885 * Copy cpu0 IdlePTD to new IdlePTD - copying only 886 * kernel mappings 887 */ 888 pmap_qenter(newPTD, m, 4); 889 890 memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), 891 (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), 892 nkpt*sizeof(vm_paddr_t)); 893 894 pmap_qremove(newPTD, 4); 895 kva_free(newPTD, 4 * PAGE_SIZE); 896 /* 897 * map actual idle stack to boot_stack 898 */ 899 pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); 900 901 902 xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1])); 903 rw_wlock(&pvh_global_lock); 904 for (i = 0; i < 4; i++) { 905 int pdir = (PTDPTDI + i) / NPDEPG; 906 int curoffset = (PTDPTDI + i) % NPDEPG; 907 908 xen_queue_pt_update((vm_paddr_t) 909 ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), 910 ma[i]); 911 } 912 PT_UPDATES_FLUSH(); 913 rw_wunlock(&pvh_global_lock); 914 915 memset(&ctxt, 0, sizeof(ctxt)); 916 ctxt.flags = VGCF_IN_KERNEL; 917 ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); 918 ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); 919 ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); 920 ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); 921 ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); 922 
ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); 923 ctxt.user_regs.eip = (unsigned long)init_secondary; 924 ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ 925 926 memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); 927 928 smp_trap_init(ctxt.trap_ctxt); 929 930 ctxt.ldt_ents = 0; 931 ctxt.gdt_frames[0] = 932 (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); 933 ctxt.gdt_ents = 512; 934 935#ifdef __i386__ 936 ctxt.user_regs.esp = boot_stack + PAGE_SIZE; 937 938 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); 939 ctxt.kernel_sp = boot_stack + PAGE_SIZE; 940 941 ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); 942 ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; 943 ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); 944 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; 945 946 ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]); 947#else /* __x86_64__ */ 948 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); 949 ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); 950 ctxt.kernel_sp = idle->thread.rsp0; 951 952 ctxt.event_callback_eip = (unsigned long)hypervisor_callback; 953 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; 954 ctxt.syscall_callback_eip = (unsigned long)system_call; 955 956 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); 957 958 ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); 959#endif 960 961 printf("gdtpfn=%lx pdptpfn=%lx\n", 962 ctxt.gdt_frames[0], 963 ctxt.ctrlreg[3] >> PAGE_SHIFT); 964 965 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); 966 DELAY(3000); 967 PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); 968} 969 970/* 971 * This function starts the AP (application processor) identified 972 * by the APIC ID 'physicalCpu'. It does quite a "song and dance" 973 * to accomplish this. This is necessary because of the nuances 974 * of the different hardware we might encounter. It isn't pretty, 975 * but it seems to work. 
976 */ 977 978int cpus; 979static int 980start_ap(int apic_id) 981{ 982 int ms; 983 984 /* used as a watchpoint to signal AP startup */ 985 cpus = mp_naps; 986 987 cpu_initialize_context(apic_id); 988 989 /* Wait up to 5 seconds for it to start. */ 990 for (ms = 0; ms < 5000; ms++) { 991 if (mp_naps > cpus) 992 return (1); /* return SUCCESS */ 993 DELAY(1000); 994 } 995 return (0); /* return FAILURE */ 996} 997 998static void 999ipi_pcpu(int cpu, u_int ipi) 1000{ 1001 KASSERT((ipi <= nitems(xen_ipis)), ("invalid IPI")); 1002 xen_intr_signal(DPCPU_ID_GET(cpu, ipi_handle[ipi])); 1003} 1004 1005/* 1006 * send an IPI to a specific CPU. 1007 */ 1008static void 1009ipi_send_cpu(int cpu, u_int ipi) 1010{ 1011 u_int bitmap, old_pending, new_pending; 1012 1013 if (IPI_IS_BITMAPED(ipi)) { 1014 bitmap = 1 << ipi; 1015 ipi = IPI_BITMAP_VECTOR; 1016 do { 1017 old_pending = cpu_ipi_pending[cpu]; 1018 new_pending = old_pending | bitmap; 1019 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], 1020 old_pending, new_pending)); 1021 if (!old_pending) 1022 ipi_pcpu(cpu, RESCHEDULE_VECTOR); 1023 } else { 1024 KASSERT(call_data != NULL, ("call_data not set")); 1025 ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); 1026 } 1027} 1028 1029/* 1030 * Flush the TLB on all other CPU's 1031 */ 1032static void 1033smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) 1034{ 1035 u_int ncpu; 1036 struct _call_data data; 1037 1038 ncpu = mp_ncpus - 1; /* does not shootdown self */ 1039 if (ncpu < 1) 1040 return; /* no other cpus */ 1041 if (!(read_eflags() & PSL_I)) 1042 panic("%s: interrupts disabled", __func__); 1043 mtx_lock_spin(&smp_ipi_mtx); 1044 KASSERT(call_data == NULL, ("call_data isn't null?!")); 1045 call_data = &data; 1046 call_data->func_id = vector; 1047 call_data->arg1 = addr1; 1048 call_data->arg2 = addr2; 1049 atomic_store_rel_int(&smp_tlb_wait, 0); 1050 ipi_all_but_self(vector); 1051 while (smp_tlb_wait < ncpu) 1052 ia32_pause(); 1053 call_data = NULL; 1054 
mtx_unlock_spin(&smp_ipi_mtx); 1055} 1056 1057static void 1058smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, 1059 vm_offset_t addr2) 1060{ 1061 int cpu, ncpu, othercpus; 1062 struct _call_data data; 1063 1064 othercpus = mp_ncpus - 1; 1065 if (CPU_ISFULLSET(&mask)) { 1066 if (othercpus < 1) 1067 return; 1068 } else { 1069 CPU_CLR(PCPU_GET(cpuid), &mask); 1070 if (CPU_EMPTY(&mask)) 1071 return; 1072 } 1073 if (!(read_eflags() & PSL_I)) 1074 panic("%s: interrupts disabled", __func__); 1075 mtx_lock_spin(&smp_ipi_mtx); 1076 KASSERT(call_data == NULL, ("call_data isn't null?!")); 1077 call_data = &data; 1078 call_data->func_id = vector; 1079 call_data->arg1 = addr1; 1080 call_data->arg2 = addr2; 1081 atomic_store_rel_int(&smp_tlb_wait, 0); 1082 if (CPU_ISFULLSET(&mask)) { 1083 ncpu = othercpus; 1084 ipi_all_but_self(vector); 1085 } else { 1086 ncpu = 0; 1087 while ((cpu = CPU_FFS(&mask)) != 0) { 1088 cpu--; 1089 CPU_CLR(cpu, &mask); 1090 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, 1091 vector); 1092 ipi_send_cpu(cpu, vector); 1093 ncpu++; 1094 } 1095 } 1096 while (smp_tlb_wait < ncpu) 1097 ia32_pause(); 1098 call_data = NULL; 1099 mtx_unlock_spin(&smp_ipi_mtx); 1100} 1101 1102void 1103smp_cache_flush(void) 1104{ 1105 1106 if (smp_started) 1107 smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); 1108} 1109 1110void 1111smp_invltlb(void) 1112{ 1113 1114 if (smp_started) { 1115 smp_tlb_shootdown(IPI_INVLTLB, 0, 0); 1116 } 1117} 1118 1119void 1120smp_invlpg(vm_offset_t addr) 1121{ 1122 1123 if (smp_started) { 1124 smp_tlb_shootdown(IPI_INVLPG, addr, 0); 1125 } 1126} 1127 1128void 1129smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) 1130{ 1131 1132 if (smp_started) { 1133 smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); 1134 } 1135} 1136 1137void 1138smp_masked_invltlb(cpuset_t mask) 1139{ 1140 1141 if (smp_started) { 1142 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); 1143 } 1144} 1145 1146void 1147smp_masked_invlpg(cpuset_t mask, 
vm_offset_t addr) 1148{ 1149 1150 if (smp_started) { 1151 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); 1152 } 1153} 1154 1155void 1156smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) 1157{ 1158 1159 if (smp_started) { 1160 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); 1161 } 1162} 1163 1164/* 1165 * send an IPI to a set of cpus. 1166 */ 1167void 1168ipi_selected(cpuset_t cpus, u_int ipi) 1169{ 1170 int cpu; 1171 1172 /* 1173 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1174 * of help in order to understand what is the source. 1175 * Set the mask of receiving CPUs for this purpose. 1176 */ 1177 if (ipi == IPI_STOP_HARD) 1178 CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); 1179 1180 while ((cpu = CPU_FFS(&cpus)) != 0) { 1181 cpu--; 1182 CPU_CLR(cpu, &cpus); 1183 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1184 ipi_send_cpu(cpu, ipi); 1185 } 1186} 1187 1188/* 1189 * send an IPI to a specific CPU. 1190 */ 1191void 1192ipi_cpu(int cpu, u_int ipi) 1193{ 1194 1195 /* 1196 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1197 * of help in order to understand what is the source. 1198 * Set the mask of receiving CPUs for this purpose. 1199 */ 1200 if (ipi == IPI_STOP_HARD) 1201 CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); 1202 1203 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1204 ipi_send_cpu(cpu, ipi); 1205} 1206 1207/* 1208 * send an IPI to all CPUs EXCEPT myself 1209 */ 1210void 1211ipi_all_but_self(u_int ipi) 1212{ 1213 cpuset_t other_cpus; 1214 1215 /* 1216 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1217 * of help in order to understand what is the source. 1218 * Set the mask of receiving CPUs for this purpose. 
1219 */ 1220 other_cpus = all_cpus; 1221 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1222 if (ipi == IPI_STOP_HARD) 1223 CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); 1224 1225 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1226 ipi_selected(other_cpus, ipi); 1227} 1228 1229int 1230ipi_nmi_handler() 1231{ 1232 u_int cpuid; 1233 1234 /* 1235 * As long as there is not a simple way to know about a NMI's 1236 * source, if the bitmask for the current CPU is present in 1237 * the global pending bitword an IPI_STOP_HARD has been issued 1238 * and should be handled. 1239 */ 1240 cpuid = PCPU_GET(cpuid); 1241 if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) 1242 return (1); 1243 1244 CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); 1245 cpustop_handler(); 1246 return (0); 1247} 1248 1249/* 1250 * Handle an IPI_STOP by saving our current context and spinning until we 1251 * are resumed. 1252 */ 1253void 1254cpustop_handler(void) 1255{ 1256 int cpu; 1257 1258 cpu = PCPU_GET(cpuid); 1259 1260 savectx(&stoppcbs[cpu]); 1261 1262 /* Indicate that we are stopped */ 1263 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1264 1265 /* Wait for restart */ 1266 while (!CPU_ISSET(cpu, &started_cpus)) 1267 ia32_pause(); 1268 1269 CPU_CLR_ATOMIC(cpu, &started_cpus); 1270 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1271 1272 if (cpu == 0 && cpustop_restartfunc != NULL) { 1273 cpustop_restartfunc(); 1274 cpustop_restartfunc = NULL; 1275 } 1276} 1277 1278/* 1279 * Handlers for TLB related IPIs 1280 * 1281 * On i386 Xen PV this are no-ops since this port doesn't support SMP. 1282 */ 1283void 1284invltlb_handler(void) 1285{ 1286} 1287 1288void 1289invlpg_handler(void) 1290{ 1291} 1292 1293void 1294invlrng_handler(void) 1295{ 1296} 1297 1298void 1299invlcache_handler(void) 1300{ 1301} 1302 1303/* 1304 * This is called once the rest of the system is up and running and we're 1305 * ready to let the AP's out of the pen. 
1306 */ 1307static void 1308release_aps(void *dummy __unused) 1309{ 1310 1311 if (mp_ncpus == 1) 1312 return; 1313 atomic_store_rel_int(&aps_ready, 1); 1314 while (smp_started == 0) 1315 ia32_pause(); 1316} 1317SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1318SYSINIT(start_ipis, SI_SUB_SMP, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); 1319SYSINIT(start_cpu, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_setup_cpus, NULL); 1320