mp_machdep.c revision 222813
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2008, by Kip Macy
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 222813 2011-06-07 08:46:13Z attilio $");
29
30#include "opt_apic.h"
31#include "opt_cpu.h"
32#include "opt_kstack_pages.h"
33#include "opt_mp_watchdog.h"
34#include "opt_pmap.h"
35#include "opt_sched.h"
36#include "opt_smp.h"
37
38#if !defined(lint)
39#if !defined(SMP)
40#error How did you get here?
41#endif
42
43#ifndef DEV_APIC
44#error The apic device is required for SMP, add "device apic" to your config file.
45#endif
46#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
47#error SMP not supported with CPU_DISABLE_CMPXCHG
48#endif
49#endif /* not lint */
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/bus.h>
54#include <sys/cons.h>	/* cngetc() */
55#include <sys/cpuset.h>
56#ifdef GPROF
57#include <sys/gmon.h>
58#endif
59#include <sys/kernel.h>
60#include <sys/ktr.h>
61#include <sys/lock.h>
62#include <sys/malloc.h>
63#include <sys/memrange.h>
64#include <sys/mutex.h>
65#include <sys/pcpu.h>
66#include <sys/proc.h>
67#include <sys/sched.h>
68#include <sys/smp.h>
69#include <sys/sysctl.h>
70
71#include <vm/vm.h>
72#include <vm/vm_param.h>
73#include <vm/pmap.h>
74#include <vm/vm_kern.h>
75#include <vm/vm_extern.h>
76#include <vm/vm_page.h>
77
78#include <x86/apicreg.h>
79#include <machine/md_var.h>
80#include <machine/mp_watchdog.h>
81#include <machine/pcb.h>
82#include <machine/psl.h>
83#include <machine/smp.h>
84#include <machine/specialreg.h>
85#include <machine/pcpu.h>
86
87
88
89#include <machine/xen/xen-os.h>
90#include <xen/evtchn.h>
91#include <xen/xen_intr.h>
92#include <xen/hypervisor.h>
93#include <xen/interface/vcpu.h>
94
95
96int	mp_naps;		/* # of Application Processors */
97int	boot_cpu_id = -1;	/* designated BSP */
98
99extern	struct pcpu __pcpu[];
100
101static int bootAP;
102static union descriptor *bootAPgdt;
103
104static char resched_name[NR_CPUS][15];
105static char callfunc_name[NR_CPUS][15];
106
107/* Free these after use */
108void *bootstacks[MAXCPU];
109
110struct pcb stoppcbs[MAXCPU];
111
112/* Variables needed for SMP tlb shootdown. */
113vm_offset_t smp_tlb_addr1;
114vm_offset_t smp_tlb_addr2;
115volatile int smp_tlb_wait;
116
117typedef void call_data_func_t(uintptr_t, uintptr_t);
118
119static u_int logical_cpus;
120static volatile cpuset_t ipi_nmi_pending;
121
122/* used to hold the APs until we are ready to release them */
123static struct mtx ap_boot_mtx;
124
125/* Set to 1 once we're ready to let the APs out of the pen. */
126static volatile int aps_ready = 0;
127
128/*
129 * Store data from cpu_add() until later in the boot when we actually setup
130 * the APs.
131 */
132struct cpu_info {
133	int	cpu_present:1;
134	int	cpu_bsp:1;
135	int	cpu_disabled:1;
136} static cpu_info[MAX_APIC_ID + 1];
137int cpu_apic_ids[MAXCPU];
138int apic_cpuids[MAX_APIC_ID + 1];
139
140/* Holds pending bitmap based IPIs per CPU */
141static volatile u_int cpu_ipi_pending[MAXCPU];
142
143static int cpu_logical;
144static int cpu_cores;
145
146static void	assign_cpu_ids(void);
147static void	set_interrupt_apic_ids(void);
148int	start_all_aps(void);
149static int	start_ap(int apic_id);
150static void	release_aps(void *dummy);
151
152static u_int	hyperthreading_cpus;
153static cpuset_t	hyperthreading_cpus_mask;
154
155extern void Xhypervisor_callback(void);
156extern void failsafe_callback(void);
157extern void pmap_lazyfix_action(void);
158
159struct cpu_group *
160cpu_topo(void)
161{
162	if (cpu_cores == 0)
163		cpu_cores = 1;
164	if (cpu_logical == 0)
165		cpu_logical = 1;
166	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
167		printf("WARNING: Non-uniform processors.\n");
168		printf("WARNING: Using suboptimal topology.\n");
169		return (smp_topo_none());
170	}
171	/*
172	 * Neither multi-core nor hyper-threaded.
173	 */
174	if (cpu_logical * cpu_cores == 1)
175		return (smp_topo_none());
176	/*
177	 * Only HTT, no multi-core.
178	 */
179	if (cpu_logical > 1 && cpu_cores == 1)
180		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
181	/*
182	 * Only multi-core, no HTT.
183	 */
184	if (cpu_cores > 1 && cpu_logical == 1)
185		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
186	/*
187	 * Both HTT and multi-core.
188	 */
189	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
190	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
191}
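
/*
 * Example (illustrative, not part of the original source): a guest that
 * reports cpu_cores = 2 and cpu_logical = 2 takes the last branch above
 * and gets smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L1, 2, CG_FLAG_HTT):
 * two cores per package, each core sharing an L1 between its two hardware
 * threads.  The scheduler uses this group tree for affinity decisions.
 */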
192
193/*
194 * Calculate usable address in base memory for AP trampoline code.
195 */
196u_int
197mp_bootaddress(u_int basemem)
198{
199
200	return (basemem);
201}
202
203void
204cpu_add(u_int apic_id, char boot_cpu)
205{
206
207	if (apic_id > MAX_APIC_ID) {
208		panic("SMP: APIC ID %d too high", apic_id);
209		return;
210	}
211	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
212	    apic_id));
213	cpu_info[apic_id].cpu_present = 1;
214	if (boot_cpu) {
215		KASSERT(boot_cpu_id == -1,
216		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
217		    boot_cpu_id));
218		boot_cpu_id = apic_id;
219		cpu_info[apic_id].cpu_bsp = 1;
220	}
221	if (mp_ncpus < MAXCPU)
222		mp_ncpus++;
223	if (bootverbose)
224		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
225		    "AP");
226}
227
228void
229cpu_mp_setmaxid(void)
230{
231
232	mp_maxid = MAXCPU - 1;
233}
234
235int
236cpu_mp_probe(void)
237{
238
239	/*
240	 * Always record BSP in CPU map so that the mbuf init code works
241	 * correctly.
242	 */
243	CPU_SETOF(0, &all_cpus);
244	if (mp_ncpus == 0) {
245		/*
246		 * No CPUs were found, so this must be a UP system.  Setup
247		 * the variables to represent a system with a single CPU
248		 * with an id of 0.
249		 */
250		mp_ncpus = 1;
251		return (0);
252	}
253
254	/* At least one CPU was found. */
255	if (mp_ncpus == 1) {
256		/*
257		 * One CPU was found, so this must be a UP system with
258		 * an I/O APIC.
259		 */
260		return (0);
261	}
262
263	/* At least two CPUs were found. */
264	return (1);
265}
266
267/*
268 * Initialize the IPI handlers and start up the AP's.
269 */
270void
271cpu_mp_start(void)
272{
273	int i;
274
275	/* Initialize the logical ID to APIC ID table. */
276	for (i = 0; i < MAXCPU; i++) {
277		cpu_apic_ids[i] = -1;
278		cpu_ipi_pending[i] = 0;
279	}
280
281	/* Set boot_cpu_id if needed. */
282	if (boot_cpu_id == -1) {
283		boot_cpu_id = PCPU_GET(apic_id);
284		cpu_info[boot_cpu_id].cpu_bsp = 1;
285	} else
286		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
287		    ("BSP's APIC ID doesn't match boot_cpu_id"));
288	cpu_apic_ids[0] = boot_cpu_id;
289	apic_cpuids[boot_cpu_id] = 0;
290
291	assign_cpu_ids();
292
293	/* Start each Application Processor */
294	start_all_aps();
295
296	/* Setup the initial logical CPUs info. */
297	logical_cpus = 0;
298	CPU_ZERO(&logical_cpus_mask);
299	if (cpu_feature & CPUID_HTT)
300		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
301
302	set_interrupt_apic_ids();
303}
304
305
306static void
307iv_rendezvous(uintptr_t a, uintptr_t b)
308{
309	smp_rendezvous_action();
310}
311
312static void
313iv_invltlb(uintptr_t a, uintptr_t b)
314{
315	xen_tlb_flush();
316}
317
318static void
319iv_invlpg(uintptr_t a, uintptr_t b)
320{
321	xen_invlpg(a);
322}
323
324static void
325iv_invlrng(uintptr_t a, uintptr_t b)
326{
327	vm_offset_t start = (vm_offset_t)a;
328	vm_offset_t end = (vm_offset_t)b;
329
330	while (start < end) {
331		xen_invlpg(start);
332		start += PAGE_SIZE;
333	}
334}
335
336
337static void
338iv_invlcache(uintptr_t a, uintptr_t b)
339{
340
341	wbinvd();
342	atomic_add_int(&smp_tlb_wait, 1);
343}
344
345static void
346iv_lazypmap(uintptr_t a, uintptr_t b)
347{
348	pmap_lazyfix_action();
349	atomic_add_int(&smp_tlb_wait, 1);
350}
351
352/*
353 * These start from "IPI offset" APIC_IPI_INTS
354 */
355static call_data_func_t *ipi_vectors[6] =
356{
357  iv_rendezvous,
358  iv_invltlb,
359  iv_invlpg,
360  iv_invlrng,
361  iv_invlcache,
362  iv_lazypmap,
363};
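
/*
 * Dispatch sketch (for reference): a function IPI carries its vector in
 * call_data->func_id and smp_call_function_interrupt() below resolves the
 * handler as
 *
 *	func = ipi_vectors[func_id - APIC_IPI_INTS];
 *
 * so the order of this array must match the IPI vector numbering that
 * starts at APIC_IPI_INTS.
 */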
364
365/*
366 * Reschedule call back. Nothing to do,
367 * all the work is done automatically when
368 * we return from the interrupt.
369 */
370static int
371smp_reschedule_interrupt(void *unused)
372{
373	int cpu = PCPU_GET(cpuid);
374	u_int ipi_bitmap;
375
376	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
377
378	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
379#ifdef COUNT_IPIS
380		(*ipi_preempt_counts[cpu])++;
381#endif
382		sched_preempt(curthread);
383	}
384
385	if (ipi_bitmap & (1 << IPI_AST)) {
386#ifdef COUNT_IPIS
387		(*ipi_ast_counts[cpu])++;
388#endif
389		/* Nothing to do for AST */
390	}
391	return (FILTER_HANDLED);
392}
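
/*
 * Bitmap IPIs (IPI_PREEMPT, IPI_AST) are coalesced per CPU: the sender
 * ORs a bit into cpu_ipi_pending[cpu] and only raises RESCHEDULE_VECTOR
 * when the word was previously empty (see ipi_send_cpu()), so a single
 * interrupt here may drain several pending requests.
 */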
393
394struct _call_data {
395	uint16_t func_id;
396	uint16_t wait;
397	uintptr_t arg1;
398	uintptr_t arg2;
399	atomic_t started;
400	atomic_t finished;
401};
402
403static struct _call_data *call_data;
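
/*
 * call_data is a single shared slot rather than a per-CPU queue: the
 * initiator fills it in while holding smp_ipi_mtx (see the shootdown
 * routines below), sends CALL_FUNCTION_VECTOR, and clears it once every
 * target has answered, so at most one function IPI is in flight at a time.
 */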
404
405static int
406smp_call_function_interrupt(void *unused)
407{
408	call_data_func_t *func;
409	uintptr_t arg1 = call_data->arg1;
410	uintptr_t arg2 = call_data->arg2;
411	int wait = call_data->wait;
412	atomic_t *started = &call_data->started;
413	atomic_t *finished = &call_data->finished;
414
415	/* We only handle function IPIs, not bitmap IPIs */
416	if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR)
417		panic("invalid function id %u", call_data->func_id);
418
419	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
420	/*
421	 * Notify initiating CPU that I've grabbed the data and am
422	 * about to execute the function
423	 */
424	mb();
425	atomic_inc(started);
426	/*
427	 * At this point the info structure may be out of scope unless wait==1
428	 */
429	(*func)(arg1, arg2);
430
431	if (wait) {
432		mb();
433		atomic_inc(finished);
434	}
435	atomic_add_int(&smp_tlb_wait, 1);
436	return (FILTER_HANDLED);
437}
438
439/*
440 * Print various information about the SMP system hardware and setup.
441 */
442void
443cpu_mp_announce(void)
444{
445	int i, x;
446
447	/* List CPUs */
448	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
449	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
450		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
451			continue;
452		if (cpu_info[x].cpu_disabled)
453			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
454		else {
455			KASSERT(i < mp_ncpus,
456			    ("mp_ncpus and actual cpus are out of whack"));
457			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
458		}
459	}
460}
461
462static int
463xen_smp_intr_init(unsigned int cpu)
464{
465	int rc;
466	unsigned int irq;
467
468	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
469
470	sprintf(resched_name[cpu], "resched%u", cpu);
471	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
472				    cpu,
473				    resched_name[cpu],
474				    smp_reschedule_interrupt,
475	    INTR_TYPE_TTY, &irq);
	if (rc < 0)
		goto fail;
476
477	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
478	    cpu, irq, RESCHEDULE_VECTOR);
479
480	per_cpu(resched_irq, cpu) = irq;
481
482	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
483	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
484				    cpu,
485				    callfunc_name[cpu],
486				    smp_call_function_interrupt,
487	    INTR_TYPE_TTY, &irq);
488	if (rc < 0)
489		goto fail;
490	per_cpu(callfunc_irq, cpu) = irq;
491
492	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
493	    cpu, irq, CALL_FUNCTION_VECTOR);
494
495
496	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
497		goto fail;
498
499	return (0);
500
501 fail:
502	if (per_cpu(resched_irq, cpu) >= 0)
503		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
504	if (per_cpu(callfunc_irq, cpu) >= 0)
505		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
506	return (rc);
507}
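
/*
 * Note: under Xen there is no local APIC to program, so each "IPI" above
 * is really a per-(vector, cpu) event channel set up by
 * bind_ipi_to_irqhandler(); the "resched%u"/"callfunc%u" names are used
 * to label the bound handlers.
 */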
508
509static void
510xen_smp_intr_init_cpus(void *unused)
511{
512	int i;
513
514	for (i = 0; i < mp_ncpus; i++)
515		xen_smp_intr_init(i);
516}
517
518#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
519
520/*
521 * AP CPUs call this to initialize themselves.
522 */
523void
524init_secondary(void)
525{
526	cpuset_t tcpuset, tallcpus;
527	vm_offset_t addr;
528	int	gsel_tss;
529
530
531	/* bootAP is set in start_ap() to our ID. */
532	PCPU_SET(currentldt, _default_ldt);
533	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
534#if 0
535	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
536#endif
537	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
538	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
539	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
540#if 0
541	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
542
543	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
544#endif
545	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
546
547	/*
548	 * Set to a known state:
549	 * Set by mpboot.s: CR0_PG, CR0_PE
550	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
551	 */
552	/*
553	 * signal our startup to the BSP.
554	 */
555	mp_naps++;
556
557	/* Spin until the BSP releases the AP's. */
558	while (!aps_ready)
559		ia32_pause();
560
561	/* BSP may have changed PTD while we were waiting */
562	invltlb();
563	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
564		invlpg(addr);
565
566	/* set up FPU state on the AP */
567	npxinit();
568#if 0
569
570	/* set up SSE registers */
571	enable_sse();
572#endif
573#if 0 && defined(PAE)
574	/* Enable the PTE no-execute bit. */
575	if ((amd_feature & AMDID_NX) != 0) {
576		uint64_t msr;
577
578		msr = rdmsr(MSR_EFER) | EFER_NXE;
579		wrmsr(MSR_EFER, msr);
580	}
581#endif
582#if 0
583	/* A quick check from sanity claus */
584	if (PCPU_GET(apic_id) != lapic_id()) {
585		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
586		printf("SMP: actual apic_id = %d\n", lapic_id());
587		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
588		panic("cpuid mismatch! boom!!");
589	}
590#endif
591
592	/* Initialize curthread. */
593	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
594	PCPU_SET(curthread, PCPU_GET(idlethread));
595
596	mtx_lock_spin(&ap_boot_mtx);
597#if 0
598
599	/* Init local apic for irq's */
600	lapic_setup(1);
601#endif
602	smp_cpus++;
603
604	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
605	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
606	tcpuset = PCPU_GET(cpumask);
607
608	/* Determine if we are a logical CPU. */
609	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
610		CPU_OR(&logical_cpus_mask, &tcpuset);
611
612	/* Determine if we are a hyperthread. */
613	if (hyperthreading_cpus > 1 &&
614	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
615		CPU_OR(&hyperthreading_cpus_mask, &tcpuset);
616
617	/* Build our map of 'other' CPUs. */
618	tallcpus = all_cpus;
619	CPU_NAND(&tallcpus, &tcpuset);
620	PCPU_SET(other_cpus, tallcpus);
621#if 0
622	if (bootverbose)
623		lapic_dump("AP");
624#endif
625	if (smp_cpus == mp_ncpus) {
626		/* enable IPI's, tlb shootdown, freezes etc */
627		atomic_store_rel_int(&smp_started, 1);
628		smp_active = 1;	 /* historic */
629	}
630
631	mtx_unlock_spin(&ap_boot_mtx);
632
633	/* wait until all the APs are up */
634	while (smp_started == 0)
635		ia32_pause();
636
637	PCPU_SET(curthread, PCPU_GET(idlethread));
638
639	/* Start per-CPU event timers. */
640	cpu_initclocks_ap();
641
642	/* enter the scheduler */
643	sched_throw(NULL);
644
645	panic("scheduler returned us to %s", __func__);
646	/* NOTREACHED */
647}
648
649/*******************************************************************
650 * local functions and data
651 */
652
653/*
654 * We tell the I/O APIC code about all the CPUs we want to receive
655 * interrupts.  If we don't want certain CPUs to receive IRQs we
656 * can simply not tell the I/O APIC code about them in this function.
657 * We also do not tell it about the BSP since it tells itself about
658 * the BSP internally to work with UP kernels and on UP machines.
659 */
660static void
661set_interrupt_apic_ids(void)
662{
663	u_int i, apic_id;
664
665	for (i = 0; i < MAXCPU; i++) {
666		apic_id = cpu_apic_ids[i];
667		if (apic_id == -1)
668			continue;
669		if (cpu_info[apic_id].cpu_bsp)
670			continue;
671		if (cpu_info[apic_id].cpu_disabled)
672			continue;
673
674		/* Don't let hyperthreads service interrupts. */
675		if (hyperthreading_cpus > 1 &&
676		    apic_id % hyperthreading_cpus != 0)
677			continue;
678
679		intr_add_cpu(i);
680	}
681}
682
683/*
684 * Assign logical CPU IDs to local APICs.
685 */
686static void
687assign_cpu_ids(void)
688{
689	u_int i;
690
691	/* Check for explicitly disabled CPUs. */
692	for (i = 0; i <= MAX_APIC_ID; i++) {
693		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
694			continue;
695
696		/* Don't use this CPU if it has been disabled by a tunable. */
697		if (resource_disabled("lapic", i)) {
698			cpu_info[i].cpu_disabled = 1;
699			continue;
700		}
701	}
702
703	/*
704	 * Assign CPU IDs to local APIC IDs and disable any CPUs
705	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
706	 * so we only have to assign IDs for APs.
707	 */
708	mp_ncpus = 1;
709	for (i = 0; i <= MAX_APIC_ID; i++) {
710		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
711		    cpu_info[i].cpu_disabled)
712			continue;
713
714		if (mp_ncpus < MAXCPU) {
715			cpu_apic_ids[mp_ncpus] = i;
716			apic_cpuids[i] = mp_ncpus;
717			mp_ncpus++;
718		} else
719			cpu_info[i].cpu_disabled = 1;
720	}
721	KASSERT(mp_maxid >= mp_ncpus - 1,
722	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
723	    mp_ncpus));
724}
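
/*
 * Example (not from the original source): a CPU can be kept out of the
 * SMP setup from loader.conf with a hint such as
 *
 *	hint.lapic.2.disabled="1"
 *
 * which is what the resource_disabled("lapic", i) check above picks up.
 */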
725
726/*
727 * start each AP in our list
728 */
729/* Lowest 1MB is already mapped: don't touch */
730#define TMPMAP_START 1
731int
732start_all_aps(void)
733{
734	cpuset_t tallcpus;
735	int x, apic_id, cpu;
736	struct pcpu *pc;
737
738	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
739
740	/* set up temporary P==V mapping for AP boot */
741	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
742
743	/* start each AP */
744	for (cpu = 1; cpu < mp_ncpus; cpu++) {
745		apic_id = cpu_apic_ids[cpu];
746
747
748		bootAP = cpu;
749		bootAPgdt = gdt + (512*cpu);
750
751		/* Get per-cpu data */
752		pc = &__pcpu[bootAP];
753		pcpu_init(pc, bootAP, sizeof(struct pcpu));
754		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
755		pc->pc_apic_id = cpu_apic_ids[bootAP];
756		pc->pc_prvspace = pc;
757		pc->pc_curthread = NULL;
758
759		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
760		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
761
762		PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW);
763		bzero(bootAPgdt, PAGE_SIZE);
764		for (x = 0; x < NGDT; x++)
765			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
766		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
767#ifdef notyet
768
769                if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
770                        apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
771                        acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
772#ifdef CONFIG_ACPI
773                        if (acpiid != 0xff)
774                                x86_acpiid_to_apicid[acpiid] = apicid;
775#endif
776                }
777#endif
778
779		/* attempt to start the Application Processor */
780		if (!start_ap(cpu)) {
781			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
782			/* better panic as the AP may be running loose */
783			printf("panic y/n? [y] ");
784			if (cngetc() != 'n')
785				panic("bye-bye");
786		}
787
788		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
789	}
790
791
792	/* build our map of 'other' CPUs */
793	tallcpus = all_cpus;
794	CPU_NAND(&tallcpus, PCPU_PTR(cpumask));
795	PCPU_SET(other_cpus, tallcpus);
796
797	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
798
799	/* number of APs actually started */
800	return (mp_naps);
801}
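
/*
 * Per-AP GDT layout (for reference): each AP gets a 512-descriptor slice
 * of the global gdt[] (bootAPgdt = gdt + 512 * cpu, i.e. one page of
 * 8-byte descriptors), matching the 512 that cpu_initialize_context()
 * later reports to the hypervisor via ctxt.gdt_ents.
 */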
802
803extern uint8_t *pcpu_boot_stack;
804extern trap_info_t trap_table[];
805
806static void
807smp_trap_init(trap_info_t *trap_ctxt)
808{
809        const trap_info_t *t = trap_table;
810
811        for (t = trap_table; t->address; t++) {
812                trap_ctxt[t->vector].flags = t->flags;
813                trap_ctxt[t->vector].cs = t->cs;
814                trap_ctxt[t->vector].address = t->address;
815        }
816}
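
/*
 * smp_trap_init() copies the BSP's trap_table[] (terminated by an entry
 * with a zero address) into the new vcpu's trap context, so the AP takes
 * exceptions through the same handlers as the BSP.
 */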
817
818extern int nkpt;
819static void
820cpu_initialize_context(unsigned int cpu)
821{
822	/* vcpu_guest_context_t is too large to allocate on the stack;
823	 * we allocate it statically and protect it with a lock. */
824	vm_page_t m[4];
825	static vcpu_guest_context_t ctxt;
826	vm_offset_t boot_stack;
827	vm_offset_t newPTD;
828	vm_paddr_t ma[NPGPTD];
829	static int color;
830	int i;
831
832	/*
833	 * m[0 .. NPGPTD-1]: new page directory (PTD)
834	 * m[NPGPTD]:        boot stack
835	 * m[NPGPTD+1]:      PDPT
836	 *
837	 */
838	for (i = 0; i < NPGPTD + 2; i++) {
839		m[i] = vm_page_alloc(NULL, color++,
840		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
841		    VM_ALLOC_ZERO);
842
843		pmap_zero_page(m[i]);
844
845	}
846	boot_stack = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
847	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD * PAGE_SIZE);
848	ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V;
849
850#ifdef PAE
851	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
852	for (i = 0; i < NPGPTD; i++) {
853		((vm_paddr_t *)boot_stack)[i] =
854		ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V;
855	}
856#endif
857
858	/*
859	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
860	 * kernel mappings
861	 */
862	pmap_qenter(newPTD, m, 4);
863
864	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
865	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
866	    nkpt*sizeof(vm_paddr_t));
867
868	pmap_qremove(newPTD, 4);
869	kmem_free(kernel_map, newPTD, NPGPTD * PAGE_SIZE);
870	/*
871	 * map actual idle stack to boot_stack
872	 */
873	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));
874
875
876	xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1]));
877	vm_page_lock_queues();
878	for (i = 0; i < 4; i++) {
879		int pdir = (PTDPTDI + i) / NPDEPG;
880		int curoffset = (PTDPTDI + i) % NPDEPG;
881
882		xen_queue_pt_update((vm_paddr_t)
883		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
884		    ma[i]);
885	}
886	PT_UPDATES_FLUSH();
887	vm_page_unlock_queues();
888
889	memset(&ctxt, 0, sizeof(ctxt));
890	ctxt.flags = VGCF_IN_KERNEL;
891	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
892	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
893	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
894	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
895	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
896	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
897	ctxt.user_regs.eip = (unsigned long)init_secondary;
898	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */
899
900	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
901
902	smp_trap_init(ctxt.trap_ctxt);
903
904	ctxt.ldt_ents = 0;
905	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
906	ctxt.gdt_ents      = 512;
907
908#ifdef __i386__
909	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;
910
911	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
912	ctxt.kernel_sp = boot_stack + PAGE_SIZE;
913
914	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
915	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
916	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
917	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
918
919	ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]);
920#else /* __x86_64__ */
921	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
922	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
923	ctxt.kernel_sp = idle->thread.rsp0;
924
925	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
926	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
927	ctxt.syscall_callback_eip  = (unsigned long)system_call;
928
929	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
930
931	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
932#endif
933
934	printf("gdtpfn=%lx pdptpfn=%lx\n",
935	    ctxt.gdt_frames[0],
936	    ctxt.ctrlreg[3] >> PAGE_SHIFT);
937
938	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
939	DELAY(3000);
940	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
941}
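
/*
 * Bring-up sequence used above, in hypercall terms:
 *
 *	HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt);
 *	HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 *
 * The vcpu begins execution directly at ctxt.user_regs.eip
 * (init_secondary) on the boot stack mapped above.
 */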
942
943/*
944 * This function starts the AP (Application Processor) for the given
945 * CPU.  Despite the parameter name, callers pass the logical CPU id,
946 * which also serves as the Xen vcpu id.  There is no INIT/SIPI "song
947 * and dance" here: we hand the hypervisor a fully built vcpu context
948 * and then wait for the new AP to announce itself.
949 */
950
951int cpus;
952static int
953start_ap(int apic_id)
954{
955	int ms;
956
957	/* used as a watchpoint to signal AP startup */
958	cpus = mp_naps;
959
960	cpu_initialize_context(apic_id);
961
962	/* Wait up to 5 seconds for it to start. */
963	for (ms = 0; ms < 5000; ms++) {
964		if (mp_naps > cpus)
965			return (1);	/* return SUCCESS */
966		DELAY(1000);
967	}
968	return (0);		/* return FAILURE */
969}
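
/*
 * The handshake with the new AP is just a counter: init_secondary()
 * increments mp_naps once it is running, and start_ap() polls for that
 * increment for up to 5 seconds before declaring failure.
 */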
970
971/*
972 * send an IPI to a specific CPU.
973 */
974static void
975ipi_send_cpu(int cpu, u_int ipi)
976{
977	u_int bitmap, old_pending, new_pending;
978
979	if (IPI_IS_BITMAPED(ipi)) {
980		bitmap = 1 << ipi;
981		ipi = IPI_BITMAP_VECTOR;
982		do {
983			old_pending = cpu_ipi_pending[cpu];
984			new_pending = old_pending | bitmap;
985		} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
986		    old_pending, new_pending));
987		if (!old_pending)
988			ipi_pcpu(cpu, RESCHEDULE_VECTOR);
989	} else {
990		KASSERT(call_data != NULL, ("call_data not set"));
991		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
992	}
993}
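
/*
 * Two delivery paths (summary): bitmapped IPIs (see IPI_IS_BITMAPED) are
 * ORed into cpu_ipi_pending[] and piggy-back on RESCHEDULE_VECTOR, while
 * everything else is a "function" IPI and requires call_data to have been
 * published by the caller before CALL_FUNCTION_VECTOR is sent, hence the
 * KASSERT above.
 */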
994
995/*
996 * Flush the TLB on all other CPUs
997 */
998static void
999smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1000{
1001	u_int ncpu;
1002	struct _call_data data;
1003
1004	ncpu = mp_ncpus - 1;	/* does not shootdown self */
1005	if (ncpu < 1)
1006		return;		/* no other cpus */
1007	if (!(read_eflags() & PSL_I))
1008		panic("%s: interrupts disabled", __func__);
1009	mtx_lock_spin(&smp_ipi_mtx);
1010	KASSERT(call_data == NULL, ("call_data isn't null?!"));
1011	call_data = &data;
1012	call_data->func_id = vector;
1013	call_data->arg1 = addr1;
1014	call_data->arg2 = addr2;
1015	atomic_store_rel_int(&smp_tlb_wait, 0);
1016	ipi_all_but_self(vector);
1017	while (smp_tlb_wait < ncpu)
1018		ia32_pause();
1019	call_data = NULL;
1020	mtx_unlock_spin(&smp_ipi_mtx);
1021}
1022
1023static void
1024smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
1025{
1026	int cpu, ncpu, othercpus;
1027	struct _call_data data;
1028
1029	othercpus = mp_ncpus - 1;
1030	if (CPU_ISFULLSET(&mask)) {
1031		if (othercpus < 1)
1032			return;
1033	} else {
1034		critical_enter();
1035		CPU_NAND(&mask, PCPU_PTR(cpumask));
1036		critical_exit();
1037		if (CPU_EMPTY(&mask))
1038			return;
1039	}
1040	if (!(read_eflags() & PSL_I))
1041		panic("%s: interrupts disabled", __func__);
1042	mtx_lock_spin(&smp_ipi_mtx);
1043	KASSERT(call_data == NULL, ("call_data isn't null?!"));
1044	call_data = &data;
1045	call_data->func_id = vector;
1046	call_data->arg1 = addr1;
1047	call_data->arg2 = addr2;
1048	atomic_store_rel_int(&smp_tlb_wait, 0);
1049	if (CPU_ISFULLSET(&mask)) {
1050		ncpu = othercpus;
1051		ipi_all_but_self(vector);
1052	} else {
1053		ncpu = 0;
1054		while ((cpu = cpusetobj_ffs(&mask)) != 0) {
1055			cpu--;
1056			CPU_CLR(cpu, &mask);
1057			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu,
1058			    vector);
1059			ipi_send_cpu(cpu, vector);
1060			ncpu++;
1061		}
1062	}
1063	while (smp_tlb_wait < ncpu)
1064		ia32_pause();
1065	call_data = NULL;
1066	mtx_unlock_spin(&smp_ipi_mtx);
1067}
1068
1069void
1070smp_cache_flush(void)
1071{
1072
1073	if (smp_started)
1074		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1075}
1076
1077void
1078smp_invltlb(void)
1079{
1080
1081	if (smp_started) {
1082		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1083	}
1084}
1085
1086void
1087smp_invlpg(vm_offset_t addr)
1088{
1089
1090	if (smp_started) {
1091		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1092	}
1093}
1094
1095void
1096smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1097{
1098
1099	if (smp_started) {
1100		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1101	}
1102}
1103
1104void
1105smp_masked_invltlb(cpuset_t mask)
1106{
1107
1108	if (smp_started) {
1109		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1110	}
1111}
1112
1113void
1114smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
1115{
1116
1117	if (smp_started) {
1118		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1119	}
1120}
1121
1122void
1123smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
1124{
1125
1126	if (smp_started) {
1127		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1128	}
1129}
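
/*
 * Typical use (illustrative sketch, not from this file): after changing a
 * kernel mapping the pmap layer invalidates locally and then does roughly
 *
 *	smp_masked_invlpg(pmap->pm_active, va);
 *
 * so that every CPU in the mask runs iv_invlpg()/xen_invlpg() on that
 * address via the shootdown machinery above.
 */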
1130
1131/*
1132 * send an IPI to a set of cpus.
1133 */
1134void
1135ipi_selected(cpuset_t cpus, u_int ipi)
1136{
1137	int cpu;
1138
1139	/*
1140	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1141	 * of help to identify its source.
1142	 * Set the mask of receiving CPUs for this purpose.
1143	 */
1144	if (ipi == IPI_STOP_HARD)
1145		CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);
1146
1147	while ((cpu = cpusetobj_ffs(&cpus)) != 0) {
1148		cpu--;
1149		CPU_CLR(cpu, &cpus);
1150		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1151		ipi_send_cpu(cpu, ipi);
1152	}
1153}
1154
1155/*
1156 * send an IPI to a specific CPU.
1157 */
1158void
1159ipi_cpu(int cpu, u_int ipi)
1160{
1161
1162	/*
1163	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1164	 * of help to identify its source.
1165	 * Set the mask of receiving CPUs for this purpose.
1166	 */
1167	if (ipi == IPI_STOP_HARD)
1168		CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);
1169
1170	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1171	ipi_send_cpu(cpu, ipi);
1172}
1173
1174/*
1175 * send an IPI to all CPUs EXCEPT myself
1176 */
1177void
1178ipi_all_but_self(u_int ipi)
1179{
1180	cpuset_t other_cpus;
1181
1182	/*
1183	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1184	 * of help to identify its source.
1185	 * Set the mask of receiving CPUs for this purpose.
1186	 */
1187	sched_pin();
1188	other_cpus = PCPU_GET(other_cpus);
1189	sched_unpin();
1190	if (ipi == IPI_STOP_HARD)
1191		CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);
1192
1193	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1194	ipi_selected(other_cpus, ipi);
1195}
1196
1197int
1198ipi_nmi_handler(void)
1199{
1200	cpuset_t cpumask;
1201
1202	/*
1203	 * Since there is no simple way to identify an NMI's source, assume
1204	 * that if this CPU's bit is set in the global pending bitmask then
1205	 * an IPI_STOP_HARD has been issued to this CPU and should be
1206	 * handled.
1207	 */
1208	sched_pin();
1209	cpumask = PCPU_GET(cpumask);
1210	sched_unpin();
1211	if (!CPU_OVERLAP(&ipi_nmi_pending, &cpumask))
1212		return (1);
1213
1214	CPU_NAND_ATOMIC(&ipi_nmi_pending, &cpumask);
1215	cpustop_handler();
1216	return (0);
1217}
1218
1219/*
1220 * Handle an IPI_STOP by saving our current context and spinning until we
1221 * are resumed.
1222 */
1223void
1224cpustop_handler(void)
1225{
1226	cpuset_t cpumask;
1227	int cpu;
1228
1229	sched_pin();
1230	cpumask = PCPU_GET(cpumask);
1231	cpu = PCPU_GET(cpuid);
1232	sched_unpin();
1233
1234	savectx(&stoppcbs[cpu]);
1235
1236	/* Indicate that we are stopped */
1237	CPU_OR_ATOMIC(&stopped_cpus, &cpumask);
1238
1239	/* Wait for restart */
1240	while (!CPU_OVERLAP(&started_cpus, &cpumask))
1241	    ia32_pause();
1242
1243	CPU_NAND_ATOMIC(&started_cpus, &cpumask);
1244	CPU_NAND_ATOMIC(&stopped_cpus, &cpumask);
1245
1246	if (cpu == 0 && cpustop_restartfunc != NULL) {
1247		cpustop_restartfunc();
1248		cpustop_restartfunc = NULL;
1249	}
1250}
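
/*
 * Stop/restart protocol: the CPU that initiated the stop sets bits in
 * started_cpus to release the stopped CPUs; each CPU clears its own bit
 * from both masks on the way out, and CPU 0 additionally runs
 * cpustop_restartfunc if one was registered.
 */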
1251
1252/*
1253 * This is called once the rest of the system is up and running and we're
1254 * ready to let the APs out of the pen.
1255 */
1256static void
1257release_aps(void *dummy __unused)
1258{
1259
1260	if (mp_ncpus == 1)
1261		return;
1262	atomic_store_rel_int(&aps_ready, 1);
1263	while (smp_started == 0)
1264		ia32_pause();
1265}
1266SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1267SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
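
/*
 * Ordering note: the IPI event channels are bound at SI_SUB_INTR, which
 * runs before SI_SUB_SMP, so by the time release_aps() lets the APs into
 * the scheduler the reschedule and call-function handlers are already in
 * place.
 */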
1268
1269