mp_machdep.c revision 191759
1/*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2008, by Kip Macy
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 *    derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 191759 2009-05-02 22:22:00Z kmacy $");
29
30#include "opt_apic.h"
31#include "opt_cpu.h"
32#include "opt_kstack_pages.h"
33#include "opt_mp_watchdog.h"
34#include "opt_sched.h"
35#include "opt_smp.h"
36
37#if !defined(lint)
38#if !defined(SMP)
39#error How did you get here?
40#endif
41
42#ifndef DEV_APIC
43#error The apic device is required for SMP, add "device apic" to your config file.
44#endif
45#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
46#error SMP not supported with CPU_DISABLE_CMPXCHG
47#endif
48#endif /* not lint */
49
50#include <sys/param.h>
51#include <sys/systm.h>
52#include <sys/bus.h>
53#include <sys/cons.h>	/* cngetc() */
54#ifdef GPROF
55#include <sys/gmon.h>
56#endif
57#include <sys/kernel.h>
58#include <sys/ktr.h>
59#include <sys/lock.h>
60#include <sys/malloc.h>
61#include <sys/memrange.h>
62#include <sys/mutex.h>
63#include <sys/pcpu.h>
64#include <sys/proc.h>
65#include <sys/sched.h>
66#include <sys/smp.h>
67#include <sys/sysctl.h>
68
69#include <vm/vm.h>
70#include <vm/vm_param.h>
71#include <vm/pmap.h>
72#include <vm/vm_kern.h>
73#include <vm/vm_extern.h>
74#include <vm/vm_page.h>
75
76#include <machine/apicreg.h>
77#include <machine/md_var.h>
78#include <machine/mp_watchdog.h>
79#include <machine/pcb.h>
80#include <machine/psl.h>
81#include <machine/smp.h>
82#include <machine/specialreg.h>
83#include <machine/pcpu.h>
84
85
86
87#include <machine/xen/xen-os.h>
88#include <xen/evtchn.h>
89#include <xen/xen_intr.h>
90#include <xen/hypervisor.h>
91#include <xen/interface/vcpu.h>
92
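/*
 * In this port stop_cpus_with_nmi is hard-wired off: IPI_STOP is always
 * delivered as an ordinary event channel IPI rather than an NMI (see
 * ipi_all_but_self()).
 */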
93#define stop_cpus_with_nmi	0
94
95
96int	mp_naps;		/* # of Application processors */
97int	boot_cpu_id = -1;	/* designated BSP */
98
99extern	struct pcpu __pcpu[];
100
101static int bootAP;
102static union descriptor *bootAPgdt;
103
104static char resched_name[NR_CPUS][15];
105static char callfunc_name[NR_CPUS][15];
106
107/* Free these after use */
108void *bootstacks[MAXCPU];
109
110/* Hotwire a 0->4MB V==P mapping */
111extern pt_entry_t *KPTphys;
112
113struct pcb stoppcbs[MAXCPU];
114
115/* Variables needed for SMP tlb shootdown. */
116vm_offset_t smp_tlb_addr1;
117vm_offset_t smp_tlb_addr2;
118volatile int smp_tlb_wait;
119
120typedef void call_data_func_t(uintptr_t, uintptr_t);
121
122static u_int logical_cpus;
123
124/* used to hold the APs until we are ready to release them */
125static struct mtx ap_boot_mtx;
126
127/* Set to 1 once we're ready to let the APs out of the pen. */
128static volatile int aps_ready = 0;
129
130/*
131 * Store data from cpu_add() until later in the boot when we actually setup
132 * the APs.
133 */
134struct cpu_info {
135	int	cpu_present:1;
136	int	cpu_bsp:1;
137	int	cpu_disabled:1;
138} static cpu_info[MAX_APIC_ID + 1];
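/*
 * cpu_apic_ids[] maps logical CPU ids to APIC ids and apic_cpuids[] is the
 * inverse mapping; both are filled in by cpu_mp_start()/assign_cpu_ids().
 */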
139int cpu_apic_ids[MAXCPU];
140int apic_cpuids[MAX_APIC_ID + 1];
141
142/* Holds pending bitmap based IPIs per CPU */
143static volatile u_int cpu_ipi_pending[MAXCPU];
144
145static int cpu_logical;
146static int cpu_cores;
147
148static void	assign_cpu_ids(void);
149static void	set_interrupt_apic_ids(void);
150int	start_all_aps(void);
151static int	start_ap(int apic_id);
152static void	release_aps(void *dummy);
153
154static u_int	hyperthreading_cpus;
155static cpumask_t	hyperthreading_cpus_mask;
156
157extern void Xhypervisor_callback(void);
158extern void failsafe_callback(void);
159extern void pmap_lazyfix_action(void);
160
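/*
 * Build the CPU topology tree handed to the scheduler: flat if we know
 * nothing about cores or HTT, otherwise one or two levels of sharing.
 */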
161struct cpu_group *
162cpu_topo(void)
163{
164	if (cpu_cores == 0)
165		cpu_cores = 1;
166	if (cpu_logical == 0)
167		cpu_logical = 1;
168	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
169		printf("WARNING: Non-uniform processors.\n");
170		printf("WARNING: Using suboptimal topology.\n");
171		return (smp_topo_none());
172	}
173	/*
174	 * No multi-core or hyper-threaded.
175	 */
176	if (cpu_logical * cpu_cores == 1)
177		return (smp_topo_none());
178	/*
179	 * Only HTT, no multi-core.
180	 */
181	if (cpu_logical > 1 && cpu_cores == 1)
182		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
183	/*
184	 * Only multi-core, no HTT.
185	 */
186	if (cpu_cores > 1 && cpu_logical == 1)
187		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
188	/*
189	 * Both HTT and multi-core.
190	 */
191	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
192	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
193}
194
195/*
196 * Calculate usable address in base memory for AP trampoline code.
197 */
198u_int
199mp_bootaddress(u_int basemem)
200{
201
202	return (basemem);
203}
204
205void
206cpu_add(u_int apic_id, char boot_cpu)
207{
208
209	if (apic_id > MAX_APIC_ID) {
210		panic("SMP: APIC ID %d too high", apic_id);
211		return;
212	}
213	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
214	    apic_id));
215	cpu_info[apic_id].cpu_present = 1;
216	if (boot_cpu) {
217		KASSERT(boot_cpu_id == -1,
218		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
219		    boot_cpu_id));
220		boot_cpu_id = apic_id;
221		cpu_info[apic_id].cpu_bsp = 1;
222	}
223	if (mp_ncpus < MAXCPU)
224		mp_ncpus++;
225	if (bootverbose)
226		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
227		    "AP");
228}
229
230void
231cpu_mp_setmaxid(void)
232{
233
234	mp_maxid = MAXCPU - 1;
235}
236
237int
238cpu_mp_probe(void)
239{
240
241	/*
242	 * Always record BSP in CPU map so that the mbuf init code works
243	 * correctly.
244	 */
245	all_cpus = 1;
246	if (mp_ncpus == 0) {
247		/*
248		 * No CPUs were found, so this must be a UP system.  Setup
249		 * the variables to represent a system with a single CPU
250		 * with an id of 0.
251		 */
252		mp_ncpus = 1;
253		return (0);
254	}
255
256	/* At least one CPU was found. */
257	if (mp_ncpus == 1) {
258		/*
259		 * One CPU was found, so this must be a UP system with
260		 * an I/O APIC.
261		 */
262		return (0);
263	}
264
265	/* At least two CPUs were found. */
266	return (1);
267}
268
269/*
270 * Initialize the IPI handlers and start up the APs.
271 */
272void
273cpu_mp_start(void)
274{
275	int i;
276
277	/* Initialize the logical ID to APIC ID table. */
278	for (i = 0; i < MAXCPU; i++) {
279		cpu_apic_ids[i] = -1;
280		cpu_ipi_pending[i] = 0;
281	}
282
283	/* Set boot_cpu_id if needed. */
284	if (boot_cpu_id == -1) {
285		boot_cpu_id = PCPU_GET(apic_id);
286		cpu_info[boot_cpu_id].cpu_bsp = 1;
287	} else
288		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
289		    ("BSP's APIC ID doesn't match boot_cpu_id"));
290	cpu_apic_ids[0] = boot_cpu_id;
291	apic_cpuids[boot_cpu_id] = 0;
292
293	assign_cpu_ids();
294
295	/* Start each Application Processor */
296	start_all_aps();
297
298	/* Setup the initial logical CPUs info. */
299	logical_cpus = logical_cpus_mask = 0;
300	if (cpu_feature & CPUID_HTT)
301		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
302
303	set_interrupt_apic_ids();
304}
305
306
307static void
308iv_rendezvous(uintptr_t a, uintptr_t b)
309{
310	smp_rendezvous_action();
311}
312
313static void
314iv_invltlb(uintptr_t a, uintptr_t b)
315{
316	xen_tlb_flush();
317}
318
319static void
320iv_invlpg(uintptr_t a, uintptr_t b)
321{
322	xen_invlpg(a);
323}
324
325static void
326iv_invlrng(uintptr_t a, uintptr_t b)
327{
328	vm_offset_t start = (vm_offset_t)a;
329	vm_offset_t end = (vm_offset_t)b;
330
331	while (start < end) {
332		xen_invlpg(start);
333		start += PAGE_SIZE;
334	}
335}
336
337
338static void
339iv_invlcache(uintptr_t a, uintptr_t b)
340{
341
342	wbinvd();
343	atomic_add_int(&smp_tlb_wait, 1);
344}
345
346static void
347iv_lazypmap(uintptr_t a, uintptr_t b)
348{
349	pmap_lazyfix_action();
350	atomic_add_int(&smp_tlb_wait, 1);
351}
352
353
354static void
355iv_noop(uintptr_t a, uintptr_t b)
356{
357	atomic_add_int(&smp_tlb_wait, 1);
358}
359
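/*
 * Dispatch table for CALL_FUNCTION IPIs.  call_data->func_id indexes this
 * table directly, so the entries must stay in the same order as the
 * corresponding IPI_* vector numbers; the iv_noop entries merely
 * acknowledge the IPI.
 */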
360static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR] =
361{
362  iv_noop,
363  iv_noop,
364  iv_rendezvous,
365  iv_invltlb,
366  iv_invlpg,
367  iv_invlrng,
368  iv_invlcache,
369  iv_lazypmap,
370};
371
372/*
373 * Reschedule callback.  Nothing to do;
374 * all the work is done automatically when
375 * we return from the interrupt.
376 */
377static int
378smp_reschedule_interrupt(void *unused)
379{
380	int cpu = PCPU_GET(cpuid);
381	u_int ipi_bitmap;
382
383	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
384
385	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
386#ifdef COUNT_IPIS
387		(*ipi_preempt_counts[cpu])++;
388#endif
389		sched_preempt(curthread);
390	}
391
392	if (ipi_bitmap & (1 << IPI_AST)) {
393#ifdef COUNT_IPIS
394		(*ipi_ast_counts[cpu])++;
395#endif
396		/* Nothing to do for AST */
397	}
398	return (FILTER_HANDLED);
399}
400
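/*
 * Argument block for CALL_FUNCTION IPIs.  The initiating CPU points the
 * global 'call_data' at a stack-allocated instance while holding
 * smp_ipi_mtx, fills in the function id and arguments, sends the IPIs and
 * then spins on smp_tlb_wait until every target CPU has run
 * smp_call_function_interrupt().
 */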
401struct _call_data {
402	uint16_t func_id;
403	uint16_t wait;
404	uintptr_t arg1;
405	uintptr_t arg2;
406	atomic_t started;
407	atomic_t finished;
408};
409
410static struct _call_data *call_data;
411
412static int
413smp_call_function_interrupt(void *unused)
414{
415	call_data_func_t *func;
416	uintptr_t arg1 = call_data->arg1;
417	uintptr_t arg2 = call_data->arg2;
418	int wait = call_data->wait;
419	atomic_t *started = &call_data->started;
420	atomic_t *finished = &call_data->finished;
421
422	if (call_data->func_id >= IPI_BITMAP_VECTOR)
423		panic("invalid function id %u", call_data->func_id);
424
425	func = ipi_vectors[call_data->func_id];
426	/*
427	 * Notify initiating CPU that I've grabbed the data and am
428	 * about to execute the function
429	 */
430	mb();
431	atomic_inc(started);
432	/*
433	 * At this point the info structure may be out of scope unless wait==1
434	 */
435	(*func)(arg1, arg2);
436
437	if (wait) {
438		mb();
439		atomic_inc(finished);
440	}
441	atomic_add_int(&smp_tlb_wait, 1);
442	return (FILTER_HANDLED);
443}
444
445/*
446 * Print various information about the SMP system hardware and setup.
447 */
448void
449cpu_mp_announce(void)
450{
451	int i, x;
452
453	/* List CPUs */
454	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
455	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
456		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
457			continue;
458		if (cpu_info[x].cpu_disabled)
459			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
460		else {
461			KASSERT(i < mp_ncpus,
462			    ("mp_ncpus and actual cpus are out of whack"));
463			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
464		}
465	}
466}
467
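/*
 * Bind the per-CPU RESCHEDULE and CALL_FUNCTION IPI event channels for
 * 'cpu' and, on the APs, start the per-CPU clock interrupts.  On failure
 * any channel that was already bound is unbound again.
 */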
468static int
469xen_smp_intr_init(unsigned int cpu)
470{
471	int rc;
472	unsigned int irq;
473
474	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
475
476	sprintf(resched_name[cpu], "resched%u", cpu);
477	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
478				    cpu,
479				    resched_name[cpu],
480				    smp_reschedule_interrupt,
481	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
	if (rc < 0)
		goto fail;
482
483	printf("cpu=%d irq=%d vector=%d\n",
484	    cpu, rc, RESCHEDULE_VECTOR);
485
486	per_cpu(resched_irq, cpu) = irq;
487
488	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
489	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
490				    cpu,
491				    callfunc_name[cpu],
492				    smp_call_function_interrupt,
493	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
494	if (rc < 0)
495		goto fail;
496	per_cpu(callfunc_irq, cpu) = irq;
497
498	printf("cpu=%d irq=%d vector=%d\n",
499	    cpu, rc, CALL_FUNCTION_VECTOR);
500
501
502	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
503		goto fail;
504
505	return 0;
506
507 fail:
508	if (per_cpu(resched_irq, cpu) >= 0)
509		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
510	if (per_cpu(callfunc_irq, cpu) >= 0)
511		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
512	return rc;
513}
514
515static void
516xen_smp_intr_init_cpus(void *unused)
517{
518	int i;
519
520	for (i = 0; i < mp_ncpus; i++)
521		xen_smp_intr_init(i);
522}
523
524#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
525
526/*
527 * AP CPUs call this to initialize themselves.
528 */
529void
530init_secondary(void)
531{
532	vm_offset_t addr;
533	int	gsel_tss;
534
535
536	/* bootAP is set in start_ap() to our ID. */
537	PCPU_SET(currentldt, _default_ldt);
538	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
539#if 0
540	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
541#endif
542	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
543	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
544	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
545#if 0
546	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
547
548	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
549#endif
550	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
551
552	/*
553	 * Set to a known state:
554	 * Set by mpboot.s: CR0_PG, CR0_PE
555	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
556	 */
557	/*
558	 * signal our startup to the BSP.
559	 */
560	mp_naps++;
561
562	/* Spin until the BSP releases the APs. */
563	while (!aps_ready)
564		ia32_pause();
565
566	/* BSP may have changed PTD while we were waiting */
567	invltlb();
568	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
569		invlpg(addr);
570
571	/* set up FPU state on the AP */
572	npxinit();
573#if 0
574
575	/* set up SSE registers */
576	enable_sse();
577#endif
578#if 0 && defined(PAE)
579	/* Enable the PTE no-execute bit. */
580	if ((amd_feature & AMDID_NX) != 0) {
581		uint64_t msr;
582
583		msr = rdmsr(MSR_EFER) | EFER_NXE;
584		wrmsr(MSR_EFER, msr);
585	}
586#endif
587#if 0
588	/* A quick check from sanity claus */
589	if (PCPU_GET(apic_id) != lapic_id()) {
590		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
591		printf("SMP: actual apic_id = %d\n", lapic_id());
592		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
593		panic("cpuid mismatch! boom!!");
594	}
595#endif
596
597	/* Initialize curthread. */
598	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
599	PCPU_SET(curthread, PCPU_GET(idlethread));
600
601	mtx_lock_spin(&ap_boot_mtx);
602#if 0
603
604	/* Init local apic for irq's */
605	lapic_setup(1);
606#endif
607	smp_cpus++;
608
609	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
610	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
611
612	/* Determine if we are a logical CPU. */
613	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
614		logical_cpus_mask |= PCPU_GET(cpumask);
615
616	/* Determine if we are a hyperthread. */
617	if (hyperthreading_cpus > 1 &&
618	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
619		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
620
621	/* Build our map of 'other' CPUs. */
622	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
623#if 0
624	if (bootverbose)
625		lapic_dump("AP");
626#endif
627	if (smp_cpus == mp_ncpus) {
628		/* enable IPI's, tlb shootdown, freezes etc */
629		atomic_store_rel_int(&smp_started, 1);
630		smp_active = 1;	 /* historic */
631	}
632
633	mtx_unlock_spin(&ap_boot_mtx);
634
635	/* wait until all the APs are up */
636	while (smp_started == 0)
637		ia32_pause();
638
639
640	PCPU_SET(curthread, PCPU_GET(idlethread));
641	/* enter the scheduler */
642	sched_throw(NULL);
643
644	panic("scheduler returned us to %s", __func__);
645	/* NOTREACHED */
646}
647
648/*******************************************************************
649 * local functions and data
650 */
651
652/*
653 * We tell the I/O APIC code about all the CPUs we want to receive
654 * interrupts.  If we don't want certain CPUs to receive IRQs we
655 * can simply not tell the I/O APIC code about them in this function.
656 * We also do not tell it about the BSP since it tells itself about
657 * the BSP internally to work with UP kernels and on UP machines.
658 */
659static void
660set_interrupt_apic_ids(void)
661{
662	u_int i, apic_id;
663
664	for (i = 0; i < MAXCPU; i++) {
665		apic_id = cpu_apic_ids[i];
666		if (apic_id == -1)
667			continue;
668		if (cpu_info[apic_id].cpu_bsp)
669			continue;
670		if (cpu_info[apic_id].cpu_disabled)
671			continue;
672
673		/* Don't let hyperthreads service interrupts. */
674		if (hyperthreading_cpus > 1 &&
675		    apic_id % hyperthreading_cpus != 0)
676			continue;
677
678		intr_add_cpu(i);
679	}
680}
681
682/*
683 * Assign logical CPU IDs to local APICs.
684 */
685static void
686assign_cpu_ids(void)
687{
688	u_int i;
689
690	/* Check for explicitly disabled CPUs. */
691	for (i = 0; i <= MAX_APIC_ID; i++) {
692		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
693			continue;
694
695		/* Don't use this CPU if it has been disabled by a tunable. */
696		if (resource_disabled("lapic", i)) {
697			cpu_info[i].cpu_disabled = 1;
698			continue;
699		}
700	}
701
702	/*
703	 * Assign CPU IDs to local APIC IDs and disable any CPUs
704	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
705	 * so we only have to assign IDs for APs.
706	 */
707	mp_ncpus = 1;
708	for (i = 0; i <= MAX_APIC_ID; i++) {
709		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
710		    cpu_info[i].cpu_disabled)
711			continue;
712
713		if (mp_ncpus < MAXCPU) {
714			cpu_apic_ids[mp_ncpus] = i;
715			apic_cpuids[i] = mp_ncpus;
716			mp_ncpus++;
717		} else
718			cpu_info[i].cpu_disabled = 1;
719	}
720	KASSERT(mp_maxid >= mp_ncpus - 1,
721	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
722	    mp_ncpus));
723}
724
725/*
726 * start each AP in our list
727 */
728/* Lowest 1MB is already mapped: don't touch */
729#define TMPMAP_START 1
730int
731start_all_aps(void)
732{
733	int x, apic_id, cpu;
734	struct pcpu *pc;
735
736	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
737
738	/* set up temporary P==V mapping for AP boot */
739	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
740
741	/* start each AP */
742	for (cpu = 1; cpu < mp_ncpus; cpu++) {
743		apic_id = cpu_apic_ids[cpu];
744
745
746		bootAP = cpu;
747		bootAPgdt = gdt + (512*cpu);	/* one page (512 descriptors) of GDT per CPU */
748
749		/* Get per-cpu data */
750		pc = &__pcpu[bootAP];
751		pcpu_init(pc, bootAP, sizeof(struct pcpu));
752		pc->pc_apic_id = cpu_apic_ids[bootAP];
753		pc->pc_prvspace = pc;
754		pc->pc_curthread = 0;
755
756		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
757		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
758
759		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
760		bzero(bootAPgdt, PAGE_SIZE);
761		for (x = 0; x < NGDT; x++)
762			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
763		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
764#ifdef notyet
765
766                if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
767                        apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
768                        acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
769#ifdef CONFIG_ACPI
770                        if (acpiid != 0xff)
771                                x86_acpiid_to_apicid[acpiid] = apicid;
772#endif
773                }
774#endif
775
776		/* attempt to start the Application Processor */
777		if (!start_ap(cpu)) {
778			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
779			/* better panic as the AP may be running loose */
780			printf("panic y/n? [y] ");
781			if (cngetc() != 'n')
782				panic("bye-bye");
783		}
784
785		all_cpus |= (1 << cpu);		/* record AP in CPU map */
786	}
787
788
789	/* build our map of 'other' CPUs */
790	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
791
792	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
793
794	/* number of APs actually started */
795	return mp_naps;
796}
797
798extern uint8_t *pcpu_boot_stack;
799extern trap_info_t trap_table[];
800
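/*
 * Fill a new VCPU's Xen trap table from the boot processor's trap_table[],
 * which is terminated by an entry with a zero address.
 */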
801static void
802smp_trap_init(trap_info_t *trap_ctxt)
803{
804        const trap_info_t *t = trap_table;
805
806        for (t = trap_table; t->address; t++) {
807                trap_ctxt[t->vector].flags = t->flags;
808                trap_ctxt[t->vector].cs = t->cs;
809                trap_ctxt[t->vector].address = t->address;
810        }
811}
812
813extern int nkpt;
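/*
 * Build the initial register and paging state for an AP's VCPU and hand it
 * to the hypervisor.  The VCPU starts in init_secondary() on a freshly
 * allocated boot stack, using a page directory that copies the kernel
 * mappings from cpu0's IdlePTD.
 */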
814static void
815cpu_initialize_context(unsigned int cpu)
816{
817	/* vcpu_guest_context_t is too large to allocate on the stack.
818	 * Hence we allocate statically and protect it with a lock */
819	vm_page_t m[NPGPTD + 2];
820	static vcpu_guest_context_t ctxt;
821	vm_offset_t boot_stack;
822	vm_offset_t newPTD;
823	vm_paddr_t ma[NPGPTD];
824	static int color;
825	int i;
826
827	/*
828	 * Pages [0-3]:	PTD
829	 * Page  [4]:	boot stack
830	 * Page  [5]:	PDPT
831	 *
832	 */
833	for (i = 0; i < NPGPTD + 2; i++) {
834		m[i] = vm_page_alloc(NULL, color++,
835		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
836		    VM_ALLOC_ZERO);
837
838		pmap_zero_page(m[i]);
839
840	}
841	boot_stack = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
842	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD * PAGE_SIZE);
843	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;
844
845#ifdef PAE
846	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
847	for (i = 0; i < NPGPTD; i++) {
848		((vm_paddr_t *)boot_stack)[i] =
849		ma[i] =
850		    xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
851	}
852#endif
853
854	/*
855	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
856	 * kernel mappings
857	 */
858	pmap_qenter(newPTD, m, 4);
859
860	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
861	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
862	    nkpt*sizeof(vm_paddr_t));
863
864	pmap_qremove(newPTD, 4);
865	kmem_free(kernel_map, newPTD, NPGPTD * PAGE_SIZE);
866	/*
867	 * map actual idle stack to boot_stack
868	 */
869	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));
870
871
872	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
873	vm_page_lock_queues();
874	for (i = 0; i < 4; i++) {
875		int pdir = (PTDPTDI + i) / NPDEPG;
876		int curoffset = (PTDPTDI + i) % NPDEPG;
877
878		xen_queue_pt_update((vm_paddr_t)
879		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
880		    ma[i]);
881	}
882	PT_UPDATES_FLUSH();
883	vm_page_unlock_queues();
884
885	memset(&ctxt, 0, sizeof(ctxt));
886	ctxt.flags = VGCF_IN_KERNEL;
887	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
888	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
889	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
890	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
891	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
892	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
893	ctxt.user_regs.eip = (unsigned long)init_secondary;
894	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */
895
896	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
897
898	smp_trap_init(ctxt.trap_ctxt);
899
900	ctxt.ldt_ents = 0;
901	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
902	ctxt.gdt_ents      = 512;
903
904#ifdef __i386__
905	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;
906
907	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
908	ctxt.kernel_sp = boot_stack + PAGE_SIZE;
909
910	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
911	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
912	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
913	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
914
915	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
916#else /* __x86_64__ */
917	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
918	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
919	ctxt.kernel_sp = idle->thread.rsp0;
920
921	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
922	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
923	ctxt.syscall_callback_eip  = (unsigned long)system_call;
924
925	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
926
927	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
928#endif
929
930	printf("gdtpfn=%lx pdptpfn=%lx\n",
931	    ctxt.gdt_frames[0],
932	    ctxt.ctrlreg[3] >> PAGE_SHIFT);
933
934	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
935	DELAY(3000);
936	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
937}
938
939/*
940 * This function starts the given AP (application processor).  On Xen
941 * there is no hardware "song and dance": we build an initial VCPU
942 * context for the new processor, ask the hypervisor to bring it up,
943 * and then wait up to five seconds for it to announce itself by
944 * incrementing mp_naps.
945 */
946
947int cpus;
948static int
949start_ap(int apic_id)
950{
951	int ms;
952
953	/* used as a watchpoint to signal AP startup */
954	cpus = mp_naps;
955
956	cpu_initialize_context(apic_id);
957
958	/* Wait up to 5 seconds for it to start. */
959	for (ms = 0; ms < 5000; ms++) {
960		if (mp_naps > cpus)
961			return 1;	/* return SUCCESS */
962		DELAY(1000);
963	}
964	return 0;		/* return FAILURE */
965}
966
967/*
968 * Flush the TLB on all other CPUs.
969 */
970static void
971smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
972{
973	u_int ncpu;
974	struct _call_data data;
975
976	call_data = &data;
979	ncpu = mp_ncpus - 1;	/* does not shoot down self */
980		return;		/* no other cpus */
981	if (!(read_eflags() & PSL_I))
982		panic("%s: interrupts disabled", __func__);
983	mtx_lock_spin(&smp_ipi_mtx);
984	mtx_lock_spin(&smp_ipi_mtx);
	call_data = &data;
985	call_data->arg1 = addr1;
986	call_data->arg2 = addr2;
987	atomic_store_rel_int(&smp_tlb_wait, 0);
988	ipi_all_but_self(vector);
989	while (smp_tlb_wait < ncpu)
990		ia32_pause();
991	call_data = NULL;
992	mtx_unlock_spin(&smp_ipi_mtx);
993}
994
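/*
 * As above, but only interrupt the CPUs named in 'mask' (a mask of all
 * ones means every CPU but ourselves).
 */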
995static void
996smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
997{
998	int ncpu, othercpus;
999	struct _call_data data;
1000
1001	othercpus = mp_ncpus - 1;
1002	if (mask == (u_int)-1) {
1003		ncpu = othercpus;
1004		if (ncpu < 1)
1005			return;
1006	} else {
1007		mask &= ~PCPU_GET(cpumask);
1008		if (mask == 0)
1009			return;
1010		ncpu = bitcount32(mask);
1011		if (ncpu > othercpus) {
1012			/* XXX this should be a panic offence */
1013			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1014			    ncpu, othercpus);
1015			ncpu = othercpus;
1016		}
1017		/* XXX should be a panic, implied by mask == 0 above */
1018		if (ncpu < 1)
1019			return;
1020	}
1021	if (!(read_eflags() & PSL_I))
1022		panic("%s: interrupts disabled", __func__);
1023	mtx_lock_spin(&smp_ipi_mtx);
1024	call_data = &data;
1025	call_data->func_id = vector;
1026	call_data->arg1 = addr1;
1027	call_data->arg2 = addr2;
1028	atomic_store_rel_int(&smp_tlb_wait, 0);
1029	if (mask == (u_int)-1)
1030		ipi_all_but_self(vector);
1031	else
1032		ipi_selected(mask, vector);
1033	while (smp_tlb_wait < ncpu)
1034		ia32_pause();
1035	call_data = NULL;
1036	mtx_unlock_spin(&smp_ipi_mtx);
1037}
1038
1039void
1040smp_cache_flush(void)
1041{
1042
1043	if (smp_started)
1044		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1045}
1046
1047void
1048smp_invltlb(void)
1049{
1050
1051	if (smp_started) {
1052		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1053	}
1054}
1055
1056void
1057smp_invlpg(vm_offset_t addr)
1058{
1059
1060	if (smp_started) {
1061		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1062	}
1063}
1064
1065void
1066smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1067{
1068
1069	if (smp_started) {
1070		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1071	}
1072}
1073
1074void
1075smp_masked_invltlb(u_int mask)
1076{
1077
1078	if (smp_started) {
1079		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1080	}
1081}
1082
1083void
1084smp_masked_invlpg(u_int mask, vm_offset_t addr)
1085{
1086
1087	if (smp_started) {
1088		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1089	}
1090}
1091
1092void
1093smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
1094{
1095
1096	if (smp_started) {
1097		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1098	}
1099}
1100
1101/*
1102 * send an IPI to a set of cpus.
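 * Bitmap-class IPIs (e.g. IPI_PREEMPT and IPI_AST) are accumulated in
 * cpu_ipi_pending[] and delivered through the per-CPU RESCHEDULE event
 * channel; everything else goes through the CALL_FUNCTION channel and
 * expects the caller to have set up 'call_data' first.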
1103 */
1104void
1105ipi_selected(uint32_t cpus, u_int ipi)
1106{
1107	int cpu;
1108	u_int bitmap = 0;
1109	u_int old_pending;
1110	u_int new_pending;
1111
1112	if (IPI_IS_BITMAPED(ipi)) {
1113		bitmap = 1 << ipi;
1114		ipi = IPI_BITMAP_VECTOR;
1115	}
1116
1117#ifdef STOP_NMI
1118	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
1119		ipi_nmi_selected(cpus);
1120		return;
1121	}
1122#endif
1123	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1124	while ((cpu = ffs(cpus)) != 0) {
1125		cpu--;
1126		cpus &= ~(1 << cpu);
1127
1128		KASSERT(cpu_apic_ids[cpu] != -1,
1129		    ("IPI to non-existent CPU %d", cpu));
1130
1131		if (bitmap) {
1132			do {
1133				old_pending = cpu_ipi_pending[cpu];
1134				new_pending = old_pending | bitmap;
1135		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending));
1136
1137			if (!old_pending)
1138				ipi_pcpu(cpu, RESCHEDULE_VECTOR);
1139			continue;
1140
1141		}
1142
1143		KASSERT(call_data != NULL, ("call_data not set"));
1144		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
1145	}
1146}
1147
1148/*
1149 * send an IPI to all CPUs EXCEPT myself
1150 */
1151void
1152ipi_all_but_self(u_int ipi)
1153{
1154
1155	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
1156		ipi_selected(PCPU_GET(other_cpus), ipi);
1157		return;
1158	}
1159	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1160	ipi_selected(PCPU_GET(other_cpus), ipi);
1161}
1162
1163#ifdef STOP_NMI
1164/*
1165 * send NMI IPI to selected CPUs
1166 */
1167
1168#define	BEFORE_SPIN	1000000
1169
1170void
1171ipi_nmi_selected(u_int32_t cpus)
1172{
1173	int cpu;
1174	register_t icrlo;
1175
1176	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
1177		| APIC_TRIGMOD_EDGE;
1178
1179	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);
1180
1181	atomic_set_int(&ipi_nmi_pending, cpus);
1182
1183	while ((cpu = ffs(cpus)) != 0) {
1184		cpu--;
1185		cpus &= ~(1 << cpu);
1186
1187		KASSERT(cpu_apic_ids[cpu] != -1,
1188		    ("IPI NMI to non-existent CPU %d", cpu));
1189
1190		/* Wait for an earlier IPI to finish. */
1191		if (!lapic_ipi_wait(BEFORE_SPIN))
1192			panic("ipi_nmi_selected: previous IPI has not cleared");
1193
1194		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
1195	}
1196}
1197
1198int
1199ipi_nmi_handler(void)
1200{
1201	int cpumask = PCPU_GET(cpumask);
1202
1203	if (!(ipi_nmi_pending & cpumask))
1204		return 1;
1205
1206	atomic_clear_int(&ipi_nmi_pending, cpumask);
1207	cpustop_handler();
1208	return 0;
1209}
1210
1211#endif /* STOP_NMI */
1212
1213/*
1214 * Handle an IPI_STOP by saving our current context and spinning until we
1215 * are resumed.
1216 */
1217void
1218cpustop_handler(void)
1219{
1220	int cpu = PCPU_GET(cpuid);
1221	int cpumask = PCPU_GET(cpumask);
1222
1223	savectx(&stoppcbs[cpu]);
1224
1225	/* Indicate that we are stopped */
1226	atomic_set_int(&stopped_cpus, cpumask);
1227
1228	/* Wait for restart */
1229	while (!(started_cpus & cpumask))
1230	    ia32_pause();
1231
1232	atomic_clear_int(&started_cpus, cpumask);
1233	atomic_clear_int(&stopped_cpus, cpumask);
1234
1235	if (cpu == 0 && cpustop_restartfunc != NULL) {
1236		cpustop_restartfunc();
1237		cpustop_restartfunc = NULL;
1238	}
1239}
1240
1241/*
1242 * This is called once the rest of the system is up and running and we're
1243 * ready to let the APs out of the pen.
1244 */
1245static void
1246release_aps(void *dummy __unused)
1247{
1248
1249	if (mp_ncpus == 1)
1250		return;
1251	atomic_store_rel_int(&aps_ready, 1);
1252	while (smp_started == 0)
1253		ia32_pause();
1254}
1255SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1256SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
1257
1258