mp_machdep.c revision 189420
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 189420 2009-03-05 18:43:54Z jhb $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>



#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

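/*
 * The NMI-based CPU stop used by the native MD code is hard-wired off
 * here: with this set to 0, the IPI_STOP paths in ipi_selected() and
 * ipi_all_but_self() never take the NMI branch, so stop requests always
 * travel over the ordinary event-channel IPIs.
 */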
#define stop_cpus_with_nmi	0


int	mp_naps;		/* # of Application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t , uintptr_t);

static u_int logical_cpus;

/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

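/*
 * Report the CPU topology to the scheduler.  cpu_cores and cpu_logical
 * describe cores per package and hardware threads per core; for example,
 * a uniform box with cpu_cores == 2 and cpu_logical == 2 is presented as
 * a two-level topology: L1-sharing pairs of hyperthreads inside
 * package-level core groups.  Anything non-uniform falls back to a flat
 * topology.
 */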
struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the AP's.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}


static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
	vm_offset_t start = (vm_offset_t)a;
	vm_offset_t end = (vm_offset_t)b;

	while (start < end) {
		xen_invlpg(start);
		start += PAGE_SIZE;
	}
}


static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	atomic_add_int(&smp_tlb_wait, 1);
}


static void
iv_noop(uintptr_t a, uintptr_t b)
{
	atomic_add_int(&smp_tlb_wait, 1);
}

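/*
 * Cross-CPU function calls are dispatched through this table rather than
 * through per-vector interrupt handlers: the initiator stores one of the
 * low IPI_* ids in call_data->func_id and smp_call_function_interrupt()
 * uses it as an index here, so the slot order must match those id values
 * (all of which lie below IPI_BITMAP_VECTOR).
 */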
static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR] =
{
  iv_noop,
  iv_noop,
  iv_rendezvous,
  iv_invltlb,
  iv_invlpg,
  iv_invlrng,
  iv_invlcache,
  iv_lazypmap,
};

/*
 * Reschedule call back. Nothing to do,
 * all the work is done automatically when
 * we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	return (FILTER_HANDLED);
}

struct _call_data {
	uint16_t func_id;
	uint16_t wait;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
};

static struct _call_data *call_data;
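
/*
 * call_data describes the single cross-CPU call in flight.  The initiator
 * (see smp_tlb_shootdown() below) takes smp_ipi_mtx, points call_data at
 * an on-stack structure, fills in func_id/arg1/arg2, clears smp_tlb_wait
 * and sends CALL_FUNCTION_VECTOR; each target runs the requested handler
 * and bumps smp_tlb_wait, and the initiator spins until every other CPU
 * has checked in before dropping the lock.
 */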

static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	if (call_data->func_id >= IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id];
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

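/*
 * Bind the two per-CPU IPI event channels ("resched%u" and "callfunc%u")
 * that stand in for the local APIC IPI vectors on native hardware:
 * RESCHEDULE_VECTOR drives the bitmapped preempt/AST IPIs and
 * CALL_FUNCTION_VECTOR drives the call_data dispatch above.  On failure,
 * any channel already bound for this CPU is torn down again.
 */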
static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	unsigned int irq;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
				    cpu,
				    resched_name[cpu],
				    smp_reschedule_interrupt,
	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);

	printf("cpu=%d irq=%d vector=%d\n",
	    cpu, rc, RESCHEDULE_VECTOR);

	per_cpu(resched_irq, cpu) = irq;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
				    cpu,
				    callfunc_name[cpu],
				    smp_call_function_interrupt,
	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = irq;

	printf("cpu=%d irq=%d vector=%d\n",
	    cpu, rc, CALL_FUNCTION_VECTOR);


	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return 0;

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
	return rc;
}

static void
xen_smp_intr_init_cpus(void *unused)
{
	int i;

	for (i = 0; i < mp_ncpus; i++)
		xen_smp_intr_init(i);
}

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
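/*
 * Each AP arrives here in the context built by cpu_initialize_context():
 * kernel selectors loaded, bootAPgdt installed and the dedicated boot
 * stack as its stack.  It finishes its own per-CPU setup, announces
 * itself by incrementing mp_naps, then spins on aps_ready until
 * release_aps() lets it enter the scheduler via sched_throw().
 */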
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;


	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0

	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0

	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();


	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch*/
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x,apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];


		bootAP = cpu;
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet

                if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
                        apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
                        acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
                        if (acpiid != 0xff)
                                x86_acpiid_to_apicid[acpiid] = apicid;
#endif
                }
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}


	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return mp_naps;
}

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
        const trap_info_t *t = trap_table;

        for (t = trap_table; t->address; t++) {
                trap_ctxt[t->vector].flags = t->flags;
                trap_ctxt[t->vector].cs = t->cs;
                trap_ctxt[t->vector].address = t->address;
        }
}

extern int nkpt;
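/*
 * Build the initial vcpu_guest_context for an AP: a private copy of the
 * kernel page directory (plus PDPT under PAE), a one-page boot stack,
 * kernel code/data selectors, the shared trap table and bootAPgdt.  The
 * context is handed to the hypervisor with VCPUOP_initialise and the
 * vcpu is then kicked off with VCPUOP_up, entering at init_secondary().
 */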
static void
cpu_initialize_context(unsigned int cpu)
{
	/* vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock */
	vm_page_t m[NPGPTD + 2];
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Page 0,[0-3]	PTD
	 * Page 1, [4]	boot stack
	 * Page [5]	PDPT
	 *
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);

	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		ma[i] =
		    xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
	 * kernel mappings
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));


	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents      = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip  = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */

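/*
 * Under Xen the dance is short: cpu_initialize_context() builds and
 * registers the vcpu context and asks the hypervisor to run it, and the
 * new CPU signals success from init_secondary() by incrementing mp_naps,
 * which start_ap() polls (against the cpus snapshot) for up to 5 seconds.
 */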
int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	call_data = &data;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	call_data->func_id = vector;
	call_data->wait = 0;	/* completion is tracked via smp_tlb_wait */
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	call_data = &data;
	call_data->func_id = vector;
	call_data->wait = 0;	/* completion is tracked via smp_tlb_wait */
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(u_int mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(u_int mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(uint32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));

			if (!old_pending)
				ipi_pcpu(cpu, RESCHEDULE_VECTOR);
			continue;

		}

		KASSERT(call_data != NULL, ("call_data not set"));
		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
	}
}
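
/*
 * A typical bitmapped caller, e.g. the scheduler preempting another CPU,
 * would do something like
 *
 *	ipi_selected(1 << cpu, IPI_PREEMPT);
 *
 * which coalesces the request into cpu_ipi_pending[cpu] and only raises
 * RESCHEDULE_VECTOR if no bitmapped IPI was already outstanding; the
 * target then sorts out the individual requests in
 * smp_reschedule_interrupt().
 */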

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(PCPU_GET(other_cpus), ipi);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

#define	BEFORE_SPIN	1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
		| APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
	    ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the AP's out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
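
/*
 * Ordering of the two SYSINITs above: the per-CPU IPI event channels are
 * bound at SI_SUB_INTR, which runs before the SI_SUB_SMP hook that
 * releases the APs from the holding pen, so the channels exist by the
 * time smp_started is set and the shootdown/IPI paths become reachable.
 */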