mp_machdep.c revision 204972
/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 204972 2010-03-10 19:50:52Z jhb $");
29
30#include "opt_apic.h"
31#include "opt_cpu.h"
32#include "opt_kstack_pages.h"
33#include "opt_mp_watchdog.h"
34#include "opt_pmap.h"
35#include "opt_sched.h"
36#include "opt_smp.h"
37
38#if !defined(lint)
39#if !defined(SMP)
40#error How did you get here?
41#endif
42
43#ifndef DEV_APIC
44#error The apic device is required for SMP, add "device apic" to your config file.
45#endif
46#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
47#error SMP not supported with CPU_DISABLE_CMPXCHG
48#endif
49#endif /* not lint */
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/bus.h>
54#include <sys/cons.h>	/* cngetc() */
55#ifdef GPROF
56#include <sys/gmon.h>
57#endif
58#include <sys/kernel.h>
59#include <sys/ktr.h>
60#include <sys/lock.h>
61#include <sys/malloc.h>
62#include <sys/memrange.h>
63#include <sys/mutex.h>
64#include <sys/pcpu.h>
65#include <sys/proc.h>
66#include <sys/sched.h>
67#include <sys/smp.h>
68#include <sys/sysctl.h>
69
70#include <vm/vm.h>
71#include <vm/vm_param.h>
72#include <vm/pmap.h>
73#include <vm/vm_kern.h>
74#include <vm/vm_extern.h>
75#include <vm/vm_page.h>
76
77#include <machine/apicreg.h>
78#include <machine/md_var.h>
79#include <machine/mp_watchdog.h>
80#include <machine/pcb.h>
81#include <machine/psl.h>
82#include <machine/smp.h>
83#include <machine/specialreg.h>
84#include <machine/pcpu.h>
85
86
87
88#include <machine/xen/xen-os.h>
89#include <xen/evtchn.h>
90#include <xen/xen_intr.h>
91#include <xen/hypervisor.h>
92#include <xen/interface/vcpu.h>
93
94
95int	mp_naps;		/* # of Applications processors */
96int	boot_cpu_id = -1;	/* designated BSP */
97
98extern	struct pcpu __pcpu[];
99
100static int bootAP;
101static union descriptor *bootAPgdt;
102
103static char resched_name[NR_CPUS][15];
104static char callfunc_name[NR_CPUS][15];
105
106/* Free these after use */
107void *bootstacks[MAXCPU];
108
109struct pcb stoppcbs[MAXCPU];
110
111/* Variables needed for SMP tlb shootdown. */
112vm_offset_t smp_tlb_addr1;
113vm_offset_t smp_tlb_addr2;
114volatile int smp_tlb_wait;
115
116typedef void call_data_func_t(uintptr_t , uintptr_t);
117
118static u_int logical_cpus;
119static volatile cpumask_t ipi_nmi_pending;
120
121/* used to hold the AP's until we are ready to release them */
122static struct mtx ap_boot_mtx;
123
124/* Set to 1 once we're ready to let the APs out of the pen. */
125static volatile int aps_ready = 0;
126
127/*
128 * Store data from cpu_add() until later in the boot when we actually setup
129 * the APs.
130 */
131struct cpu_info {
132	int	cpu_present:1;
133	int	cpu_bsp:1;
134	int	cpu_disabled:1;
135} static cpu_info[MAX_APIC_ID + 1];
136int cpu_apic_ids[MAXCPU];
137int apic_cpuids[MAX_APIC_ID + 1];
138
139/* Holds pending bitmap based IPIs per CPU */
140static volatile u_int cpu_ipi_pending[MAXCPU];
141
142static int cpu_logical;
143static int cpu_cores;
144
145static void	assign_cpu_ids(void);
146static void	set_interrupt_apic_ids(void);
147int	start_all_aps(void);
148static int	start_ap(int apic_id);
149static void	release_aps(void *dummy);
150
151static u_int	hyperthreading_cpus;
152static cpumask_t	hyperthreading_cpus_mask;
153
154extern void Xhypervisor_callback(void);
155extern void failsafe_callback(void);
156extern void pmap_lazyfix_action(void);
157
158struct cpu_group *
159cpu_topo(void)
160{
161	if (cpu_cores == 0)
162		cpu_cores = 1;
163	if (cpu_logical == 0)
164		cpu_logical = 1;
165	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
166		printf("WARNING: Non-uniform processors.\n");
167		printf("WARNING: Using suboptimal topology.\n");
168		return (smp_topo_none());
169	}
170	/*
171	 * No multi-core or hyper-threaded.
172	 */
173	if (cpu_logical * cpu_cores == 1)
174		return (smp_topo_none());
175	/*
176	 * Only HTT no multi-core.
177	 */
178	if (cpu_logical > 1 && cpu_cores == 1)
179		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
180	/*
181	 * Only multi-core no HTT.
182	 */
183	if (cpu_cores > 1 && cpu_logical == 1)
184		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
185	/*
186	 * Both HTT and multi-core.
187	 */
188	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
189	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
190}
191
/*
 * Calculate usable address in base memory for AP trampoline code.
 * This implementation reserves nothing: the value passed in is handed
 * back unchanged.
 */
u_int
mp_bootaddress(u_int basemem)
{
	u_int usable = basemem;

	return (usable);
}
201
202void
203cpu_add(u_int apic_id, char boot_cpu)
204{
205
206	if (apic_id > MAX_APIC_ID) {
207		panic("SMP: APIC ID %d too high", apic_id);
208		return;
209	}
210	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
211	    apic_id));
212	cpu_info[apic_id].cpu_present = 1;
213	if (boot_cpu) {
214		KASSERT(boot_cpu_id == -1,
215		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
216		    boot_cpu_id));
217		boot_cpu_id = apic_id;
218		cpu_info[apic_id].cpu_bsp = 1;
219	}
220	if (mp_ncpus < MAXCPU)
221		mp_ncpus++;
222	if (bootverbose)
223		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
224		    "AP");
225}
226
227void
228cpu_mp_setmaxid(void)
229{
230
231	mp_maxid = MAXCPU - 1;
232}
233
234int
235cpu_mp_probe(void)
236{
237
238	/*
239	 * Always record BSP in CPU map so that the mbuf init code works
240	 * correctly.
241	 */
242	all_cpus = 1;
243	if (mp_ncpus == 0) {
244		/*
245		 * No CPUs were found, so this must be a UP system.  Setup
246		 * the variables to represent a system with a single CPU
247		 * with an id of 0.
248		 */
249		mp_ncpus = 1;
250		return (0);
251	}
252
253	/* At least one CPU was found. */
254	if (mp_ncpus == 1) {
255		/*
256		 * One CPU was found, so this must be a UP system with
257		 * an I/O APIC.
258		 */
259		return (0);
260	}
261
262	/* At least two CPUs were found. */
263	return (1);
264}
265
266/*
267 * Initialize the IPI handlers and start up the AP's.
268 */
269void
270cpu_mp_start(void)
271{
272	int i;
273
274	/* Initialize the logical ID to APIC ID table. */
275	for (i = 0; i < MAXCPU; i++) {
276		cpu_apic_ids[i] = -1;
277		cpu_ipi_pending[i] = 0;
278	}
279
280	/* Set boot_cpu_id if needed. */
281	if (boot_cpu_id == -1) {
282		boot_cpu_id = PCPU_GET(apic_id);
283		cpu_info[boot_cpu_id].cpu_bsp = 1;
284	} else
285		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
286		    ("BSP's APIC ID doesn't match boot_cpu_id"));
287	cpu_apic_ids[0] = boot_cpu_id;
288	apic_cpuids[boot_cpu_id] = 0;
289
290	assign_cpu_ids();
291
292	/* Start each Application Processor */
293	start_all_aps();
294
295	/* Setup the initial logical CPUs info. */
296	logical_cpus = logical_cpus_mask = 0;
297	if (cpu_feature & CPUID_HTT)
298		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
299
300	set_interrupt_apic_ids();
301}
302
303
304static void
305iv_rendezvous(uintptr_t a, uintptr_t b)
306{
307	smp_rendezvous_action();
308}
309
310static void
311iv_invltlb(uintptr_t a, uintptr_t b)
312{
313	xen_tlb_flush();
314}
315
316static void
317iv_invlpg(uintptr_t a, uintptr_t b)
318{
319	xen_invlpg(a);
320}
321
322static void
323iv_invlrng(uintptr_t a, uintptr_t b)
324{
325	vm_offset_t start = (vm_offset_t)a;
326	vm_offset_t end = (vm_offset_t)b;
327
328	while (start < end) {
329		xen_invlpg(start);
330		start += PAGE_SIZE;
331	}
332}
333
334
335static void
336iv_invlcache(uintptr_t a, uintptr_t b)
337{
338
339	wbinvd();
340	atomic_add_int(&smp_tlb_wait, 1);
341}
342
343static void
344iv_lazypmap(uintptr_t a, uintptr_t b)
345{
346	pmap_lazyfix_action();
347	atomic_add_int(&smp_tlb_wait, 1);
348}
349
350/*
351 * These start from "IPI offset" APIC_IPI_INTS
352 */
353static call_data_func_t *ipi_vectors[6] =
354{
355  iv_rendezvous,
356  iv_invltlb,
357  iv_invlpg,
358  iv_invlrng,
359  iv_invlcache,
360  iv_lazypmap,
361};
362
363/*
364 * Reschedule call back. Nothing to do,
365 * all the work is done automatically when
366 * we return from the interrupt.
367 */
368static int
369smp_reschedule_interrupt(void *unused)
370{
371	int cpu = PCPU_GET(cpuid);
372	u_int ipi_bitmap;
373
374	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
375
376	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
377#ifdef COUNT_IPIS
378		(*ipi_preempt_counts[cpu])++;
379#endif
380		sched_preempt(curthread);
381	}
382
383	if (ipi_bitmap & (1 << IPI_AST)) {
384#ifdef COUNT_IPIS
385		(*ipi_ast_counts[cpu])++;
386#endif
387		/* Nothing to do for AST */
388	}
389	return (FILTER_HANDLED);
390}
391
392struct _call_data {
393	uint16_t func_id;
394	uint16_t wait;
395	uintptr_t arg1;
396	uintptr_t arg2;
397	atomic_t started;
398	atomic_t finished;
399};
400
401static struct _call_data *call_data;
402
403static int
404smp_call_function_interrupt(void *unused)
405{
406	call_data_func_t *func;
407	uintptr_t arg1 = call_data->arg1;
408	uintptr_t arg2 = call_data->arg2;
409	int wait = call_data->wait;
410	atomic_t *started = &call_data->started;
411	atomic_t *finished = &call_data->finished;
412
413	/* We only handle function IPIs, not bitmap IPIs */
414	if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR)
415		panic("invalid function id %u", call_data->func_id);
416
417	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
418	/*
419	 * Notify initiating CPU that I've grabbed the data and am
420	 * about to execute the function
421	 */
422	mb();
423	atomic_inc(started);
424	/*
425	 * At this point the info structure may be out of scope unless wait==1
426	 */
427	(*func)(arg1, arg2);
428
429	if (wait) {
430		mb();
431		atomic_inc(finished);
432	}
433	atomic_add_int(&smp_tlb_wait, 1);
434	return (FILTER_HANDLED);
435}
436
437/*
438 * Print various information about the SMP system hardware and setup.
439 */
440void
441cpu_mp_announce(void)
442{
443	int i, x;
444
445	/* List CPUs */
446	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
447	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
448		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
449			continue;
450		if (cpu_info[x].cpu_disabled)
451			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
452		else {
453			KASSERT(i < mp_ncpus,
454			    ("mp_ncpus and actual cpus are out of whack"));
455			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
456		}
457	}
458}
459
460static int
461xen_smp_intr_init(unsigned int cpu)
462{
463	int rc;
464	unsigned int irq;
465
466	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
467
468	sprintf(resched_name[cpu], "resched%u", cpu);
469	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
470				    cpu,
471				    resched_name[cpu],
472				    smp_reschedule_interrupt,
473	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
474
475	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
476	    cpu, irq, RESCHEDULE_VECTOR);
477
478	per_cpu(resched_irq, cpu) = irq;
479
480	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
481	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
482				    cpu,
483				    callfunc_name[cpu],
484				    smp_call_function_interrupt,
485	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
486	if (rc < 0)
487		goto fail;
488	per_cpu(callfunc_irq, cpu) = irq;
489
490	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
491	    cpu, irq, CALL_FUNCTION_VECTOR);
492
493
494	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
495		goto fail;
496
497	return 0;
498
499 fail:
500	if (per_cpu(resched_irq, cpu) >= 0)
501		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
502	if (per_cpu(callfunc_irq, cpu) >= 0)
503		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
504	return rc;
505}
506
507static void
508xen_smp_intr_init_cpus(void *unused)
509{
510	int i;
511
512	for (i = 0; i < mp_ncpus; i++)
513		xen_smp_intr_init(i);
514}
515
516#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
517
518/*
519 * AP CPU's call this to initialize themselves.
520 */
521void
522init_secondary(void)
523{
524	vm_offset_t addr;
525	int	gsel_tss;
526
527
528	/* bootAP is set in start_ap() to our ID. */
529	PCPU_SET(currentldt, _default_ldt);
530	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
531#if 0
532	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
533#endif
534	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
535	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
536	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
537#if 0
538	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);
539
540	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
541#endif
542	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
543
544	/*
545	 * Set to a known state:
546	 * Set by mpboot.s: CR0_PG, CR0_PE
547	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
548	 */
549	/*
550	 * signal our startup to the BSP.
551	 */
552	mp_naps++;
553
554	/* Spin until the BSP releases the AP's. */
555	while (!aps_ready)
556		ia32_pause();
557
558	/* BSP may have changed PTD while we were waiting */
559	invltlb();
560	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
561		invlpg(addr);
562
563	/* set up FPU state on the AP */
564	npxinit();
565#if 0
566
567	/* set up SSE registers */
568	enable_sse();
569#endif
570#if 0 && defined(PAE)
571	/* Enable the PTE no-execute bit. */
572	if ((amd_feature & AMDID_NX) != 0) {
573		uint64_t msr;
574
575		msr = rdmsr(MSR_EFER) | EFER_NXE;
576		wrmsr(MSR_EFER, msr);
577	}
578#endif
579#if 0
580	/* A quick check from sanity claus */
581	if (PCPU_GET(apic_id) != lapic_id()) {
582		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
583		printf("SMP: actual apic_id = %d\n", lapic_id());
584		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
585		panic("cpuid mismatch! boom!!");
586	}
587#endif
588
589	/* Initialize curthread. */
590	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
591	PCPU_SET(curthread, PCPU_GET(idlethread));
592
593	mtx_lock_spin(&ap_boot_mtx);
594#if 0
595
596	/* Init local apic for irq's */
597	lapic_setup(1);
598#endif
599	smp_cpus++;
600
601	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
602	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));
603
604	/* Determine if we are a logical CPU. */
605	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
606		logical_cpus_mask |= PCPU_GET(cpumask);
607
608	/* Determine if we are a hyperthread. */
609	if (hyperthreading_cpus > 1 &&
610	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
611		hyperthreading_cpus_mask |= PCPU_GET(cpumask);
612
613	/* Build our map of 'other' CPUs. */
614	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
615#if 0
616	if (bootverbose)
617		lapic_dump("AP");
618#endif
619	if (smp_cpus == mp_ncpus) {
620		/* enable IPI's, tlb shootdown, freezes etc */
621		atomic_store_rel_int(&smp_started, 1);
622		smp_active = 1;	 /* historic */
623	}
624
625	mtx_unlock_spin(&ap_boot_mtx);
626
627	/* wait until all the AP's are up */
628	while (smp_started == 0)
629		ia32_pause();
630
631
632	PCPU_SET(curthread, PCPU_GET(idlethread));
633	/* enter the scheduler */
634	sched_throw(NULL);
635
636	panic("scheduler returned us to %s", __func__);
637	/* NOTREACHED */
638}
639
640/*******************************************************************
641 * local functions and data
642 */
643
644/*
645 * We tell the I/O APIC code about all the CPUs we want to receive
646 * interrupts.  If we don't want certain CPUs to receive IRQs we
647 * can simply not tell the I/O APIC code about them in this function.
648 * We also do not tell it about the BSP since it tells itself about
649 * the BSP internally to work with UP kernels and on UP machines.
650 */
651static void
652set_interrupt_apic_ids(void)
653{
654	u_int i, apic_id;
655
656	for (i = 0; i < MAXCPU; i++) {
657		apic_id = cpu_apic_ids[i];
658		if (apic_id == -1)
659			continue;
660		if (cpu_info[apic_id].cpu_bsp)
661			continue;
662		if (cpu_info[apic_id].cpu_disabled)
663			continue;
664
665		/* Don't let hyperthreads service interrupts. */
666		if (hyperthreading_cpus > 1 &&
667		    apic_id % hyperthreading_cpus != 0)
668			continue;
669
670		intr_add_cpu(i);
671	}
672}
673
674/*
675 * Assign logical CPU IDs to local APICs.
676 */
677static void
678assign_cpu_ids(void)
679{
680	u_int i;
681
682	/* Check for explicitly disabled CPUs. */
683	for (i = 0; i <= MAX_APIC_ID; i++) {
684		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
685			continue;
686
687		/* Don't use this CPU if it has been disabled by a tunable. */
688		if (resource_disabled("lapic", i)) {
689			cpu_info[i].cpu_disabled = 1;
690			continue;
691		}
692	}
693
694	/*
695	 * Assign CPU IDs to local APIC IDs and disable any CPUs
696	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
697	 * so we only have to assign IDs for APs.
698	 */
699	mp_ncpus = 1;
700	for (i = 0; i <= MAX_APIC_ID; i++) {
701		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
702		    cpu_info[i].cpu_disabled)
703			continue;
704
705		if (mp_ncpus < MAXCPU) {
706			cpu_apic_ids[mp_ncpus] = i;
707			apic_cpuids[i] = mp_ncpus;
708			mp_ncpus++;
709		} else
710			cpu_info[i].cpu_disabled = 1;
711	}
712	KASSERT(mp_maxid >= mp_ncpus - 1,
713	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
714	    mp_ncpus));
715}
716
717/*
718 * start each AP in our list
719 */
720/* Lowest 1MB is already mapped: don't touch*/
721#define TMPMAP_START 1
722int
723start_all_aps(void)
724{
725	int x,apic_id, cpu;
726	struct pcpu *pc;
727
728	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
729
730	/* set up temporary P==V mapping for AP boot */
731	/* XXX this is a hack, we should boot the AP on its own stack/PTD */
732
733	/* start each AP */
734	for (cpu = 1; cpu < mp_ncpus; cpu++) {
735		apic_id = cpu_apic_ids[cpu];
736
737
738		bootAP = cpu;
739		bootAPgdt = gdt + (512*cpu);
740
741		/* Get per-cpu data */
742		pc = &__pcpu[bootAP];
743		pcpu_init(pc, bootAP, sizeof(struct pcpu));
744		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
745		pc->pc_apic_id = cpu_apic_ids[bootAP];
746		pc->pc_prvspace = pc;
747		pc->pc_curthread = 0;
748
749		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
750		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
751
752		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
753		bzero(bootAPgdt, PAGE_SIZE);
754		for (x = 0; x < NGDT; x++)
755			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
756		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
757#ifdef notyet
758
759                if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
760                        apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
761                        acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
762#ifdef CONFIG_ACPI
763                        if (acpiid != 0xff)
764                                x86_acpiid_to_apicid[acpiid] = apicid;
765#endif
766                }
767#endif
768
769		/* attempt to start the Application Processor */
770		if (!start_ap(cpu)) {
771			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
772			/* better panic as the AP may be running loose */
773			printf("panic y/n? [y] ");
774			if (cngetc() != 'n')
775				panic("bye-bye");
776		}
777
778		all_cpus |= (1 << cpu);		/* record AP in CPU map */
779	}
780
781
782	/* build our map of 'other' CPUs */
783	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
784
785	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);
786
787	/* number of APs actually started */
788	return mp_naps;
789}
790
791extern uint8_t *pcpu_boot_stack;
792extern trap_info_t trap_table[];
793
794static void
795smp_trap_init(trap_info_t *trap_ctxt)
796{
797        const trap_info_t *t = trap_table;
798
799        for (t = trap_table; t->address; t++) {
800                trap_ctxt[t->vector].flags = t->flags;
801                trap_ctxt[t->vector].cs = t->cs;
802                trap_ctxt[t->vector].address = t->address;
803        }
804}
805
806extern int nkpt;
807static void
808cpu_initialize_context(unsigned int cpu)
809{
810	/* vcpu_guest_context_t is too large to allocate on the stack.
811	 * Hence we allocate statically and protect it with a lock */
812	vm_page_t m[4];
813	static vcpu_guest_context_t ctxt;
814	vm_offset_t boot_stack;
815	vm_offset_t newPTD;
816	vm_paddr_t ma[NPGPTD];
817	static int color;
818	int i;
819
820	/*
821	 * Page 0,[0-3]	PTD
822	 * Page 1, [4]	boot stack
823	 * Page [5]	PDPT
824	 *
825	 */
826	for (i = 0; i < NPGPTD + 2; i++) {
827		m[i] = vm_page_alloc(NULL, color++,
828		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
829		    VM_ALLOC_ZERO);
830
831		pmap_zero_page(m[i]);
832
833	}
834	boot_stack = kmem_alloc_nofault(kernel_map, 1);
835	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
836	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;
837
838#ifdef PAE
839	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
840	for (i = 0; i < NPGPTD; i++) {
841		((vm_paddr_t *)boot_stack)[i] =
842		ma[i] =
843		    xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
844	}
845#endif
846
847	/*
848	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
849	 * kernel mappings
850	 */
851	pmap_qenter(newPTD, m, 4);
852
853	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
854	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
855	    nkpt*sizeof(vm_paddr_t));
856
857	pmap_qremove(newPTD, 4);
858	kmem_free(kernel_map, newPTD, 4);
859	/*
860	 * map actual idle stack to boot_stack
861	 */
862	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));
863
864
865	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
866	vm_page_lock_queues();
867	for (i = 0; i < 4; i++) {
868		int pdir = (PTDPTDI + i) / NPDEPG;
869		int curoffset = (PTDPTDI + i) % NPDEPG;
870
871		xen_queue_pt_update((vm_paddr_t)
872		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
873		    ma[i]);
874	}
875	PT_UPDATES_FLUSH();
876	vm_page_unlock_queues();
877
878	memset(&ctxt, 0, sizeof(ctxt));
879	ctxt.flags = VGCF_IN_KERNEL;
880	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
881	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
882	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
883	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
884	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
885	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
886	ctxt.user_regs.eip = (unsigned long)init_secondary;
887	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */
888
889	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
890
891	smp_trap_init(ctxt.trap_ctxt);
892
893	ctxt.ldt_ents = 0;
894	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
895	ctxt.gdt_ents      = 512;
896
897#ifdef __i386__
898	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;
899
900	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
901	ctxt.kernel_sp = boot_stack + PAGE_SIZE;
902
903	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
904	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
905	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
906	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
907
908	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
909#else /* __x86_64__ */
910	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
911	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
912	ctxt.kernel_sp = idle->thread.rsp0;
913
914	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
915	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
916	ctxt.syscall_callback_eip  = (unsigned long)system_call;
917
918	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
919
920	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
921#endif
922
923	printf("gdtpfn=%lx pdptpfn=%lx\n",
924	    ctxt.gdt_frames[0],
925	    ctxt.ctrlreg[3] >> PAGE_SHIFT);
926
927	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
928	DELAY(3000);
929	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
930}
931
932/*
933 * This function starts the AP (application processor) identified
934 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
935 * to accomplish this.  This is necessary because of the nuances
936 * of the different hardware we might encounter.  It isn't pretty,
937 * but it seems to work.
938 */
939
940int cpus;
941static int
942start_ap(int apic_id)
943{
944	int ms;
945
946	/* used as a watchpoint to signal AP startup */
947	cpus = mp_naps;
948
949	cpu_initialize_context(apic_id);
950
951	/* Wait up to 5 seconds for it to start. */
952	for (ms = 0; ms < 5000; ms++) {
953		if (mp_naps > cpus)
954			return 1;	/* return SUCCESS */
955		DELAY(1000);
956	}
957	return 0;		/* return FAILURE */
958}
959
960/*
961 * Flush the TLB on all other CPU's
962 */
963static void
964smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
965{
966	u_int ncpu;
967	struct _call_data data;
968
969	ncpu = mp_ncpus - 1;	/* does not shootdown self */
970	if (ncpu < 1)
971		return;		/* no other cpus */
972	if (!(read_eflags() & PSL_I))
973		panic("%s: interrupts disabled", __func__);
974	mtx_lock_spin(&smp_ipi_mtx);
975	KASSERT(call_data == NULL, ("call_data isn't null?!"));
976	call_data = &data;
977	call_data->func_id = vector;
978	call_data->arg1 = addr1;
979	call_data->arg2 = addr2;
980	atomic_store_rel_int(&smp_tlb_wait, 0);
981	ipi_all_but_self(vector);
982	while (smp_tlb_wait < ncpu)
983		ia32_pause();
984	call_data = NULL;
985	mtx_unlock_spin(&smp_ipi_mtx);
986}
987
988static void
989smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
990{
991	int ncpu, othercpus;
992	struct _call_data data;
993
994	othercpus = mp_ncpus - 1;
995	if (mask == (u_int)-1) {
996		ncpu = othercpus;
997		if (ncpu < 1)
998			return;
999	} else {
1000		mask &= ~PCPU_GET(cpumask);
1001		if (mask == 0)
1002			return;
1003		ncpu = bitcount32(mask);
1004		if (ncpu > othercpus) {
1005			/* XXX this should be a panic offence */
1006			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
1007			    ncpu, othercpus);
1008			ncpu = othercpus;
1009		}
1010		/* XXX should be a panic, implied by mask == 0 above */
1011		if (ncpu < 1)
1012			return;
1013	}
1014	if (!(read_eflags() & PSL_I))
1015		panic("%s: interrupts disabled", __func__);
1016	mtx_lock_spin(&smp_ipi_mtx);
1017	KASSERT(call_data == NULL, ("call_data isn't null?!"));
1018	call_data = &data;
1019	call_data->func_id = vector;
1020	call_data->arg1 = addr1;
1021	call_data->arg2 = addr2;
1022	atomic_store_rel_int(&smp_tlb_wait, 0);
1023	if (mask == (u_int)-1)
1024		ipi_all_but_self(vector);
1025	else
1026		ipi_selected(mask, vector);
1027	while (smp_tlb_wait < ncpu)
1028		ia32_pause();
1029	call_data = NULL;
1030	mtx_unlock_spin(&smp_ipi_mtx);
1031}
1032
1033void
1034smp_cache_flush(void)
1035{
1036
1037	if (smp_started)
1038		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
1039}
1040
1041void
1042smp_invltlb(void)
1043{
1044
1045	if (smp_started) {
1046		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
1047	}
1048}
1049
1050void
1051smp_invlpg(vm_offset_t addr)
1052{
1053
1054	if (smp_started) {
1055		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
1056	}
1057}
1058
1059void
1060smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
1061{
1062
1063	if (smp_started) {
1064		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
1065	}
1066}
1067
1068void
1069smp_masked_invltlb(cpumask_t mask)
1070{
1071
1072	if (smp_started) {
1073		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
1074	}
1075}
1076
1077void
1078smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
1079{
1080
1081	if (smp_started) {
1082		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
1083	}
1084}
1085
1086void
1087smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
1088{
1089
1090	if (smp_started) {
1091		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
1092	}
1093}
1094
1095/*
1096 * send an IPI to a set of cpus.
1097 */
1098void
1099ipi_selected(cpumask_t cpus, u_int ipi)
1100{
1101	int cpu;
1102	u_int bitmap = 0;
1103	u_int old_pending;
1104	u_int new_pending;
1105
1106	if (IPI_IS_BITMAPED(ipi)) {
1107		bitmap = 1 << ipi;
1108		ipi = IPI_BITMAP_VECTOR;
1109	}
1110
1111	/*
1112	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1113	 * of help in order to understand what is the source.
1114	 * Set the mask of receiving CPUs for this purpose.
1115	 */
1116	if (ipi == IPI_STOP_HARD)
1117		atomic_set_int(&ipi_nmi_pending, cpus);
1118
1119	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
1120	while ((cpu = ffs(cpus)) != 0) {
1121		cpu--;
1122		cpus &= ~(1 << cpu);
1123
1124		KASSERT(cpu_apic_ids[cpu] != -1,
1125		    ("IPI to non-existent CPU %d", cpu));
1126
1127		if (bitmap) {
1128			do {
1129				old_pending = cpu_ipi_pending[cpu];
1130				new_pending = old_pending | bitmap;
1131			} while  (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending));
1132
1133			if (!old_pending)
1134				ipi_pcpu(cpu, RESCHEDULE_VECTOR);
1135			continue;
1136
1137		} else {
1138			KASSERT(call_data != NULL, ("call_data not set"));
1139			ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
1140		}
1141	}
1142}
1143
1144/*
1145 * send an IPI to all CPUs EXCEPT myself
1146 */
1147void
1148ipi_all_but_self(u_int ipi)
1149{
1150
1151	/*
1152	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
1153	 * of help in order to understand what is the source.
1154	 * Set the mask of receiving CPUs for this purpose.
1155	 */
1156	if (ipi == IPI_STOP_HARD)
1157		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));
1158
1159	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1160	ipi_selected(PCPU_GET(other_cpus), ipi);
1161}
1162
1163int
1164ipi_nmi_handler()
1165{
1166	cpumask_t cpumask;
1167
1168	/*
1169	 * As long as there is not a simple way to know about a NMI's
1170	 * source, if the bitmask for the current CPU is present in
1171	 * the global pending bitword an IPI_STOP_HARD has been issued
1172	 * and should be handled.
1173	 */
1174	cpumask = PCPU_GET(cpumask);
1175	if ((ipi_nmi_pending & cpumask) == 0)
1176		return (1);
1177
1178	atomic_clear_int(&ipi_nmi_pending, cpumask);
1179	cpustop_handler();
1180	return (0);
1181}
1182
1183/*
1184 * Handle an IPI_STOP by saving our current context and spinning until we
1185 * are resumed.
1186 */
1187void
1188cpustop_handler(void)
1189{
1190	int cpu = PCPU_GET(cpuid);
1191	int cpumask = PCPU_GET(cpumask);
1192
1193	savectx(&stoppcbs[cpu]);
1194
1195	/* Indicate that we are stopped */
1196	atomic_set_int(&stopped_cpus, cpumask);
1197
1198	/* Wait for restart */
1199	while (!(started_cpus & cpumask))
1200	    ia32_pause();
1201
1202	atomic_clear_int(&started_cpus, cpumask);
1203	atomic_clear_int(&stopped_cpus, cpumask);
1204
1205	if (cpu == 0 && cpustop_restartfunc != NULL) {
1206		cpustop_restartfunc();
1207		cpustop_restartfunc = NULL;
1208	}
1209}
1210
1211/*
1212 * This is called once the rest of the system is up and running and we're
1213 * ready to let the AP's out of the pen.
1214 */
1215static void
1216release_aps(void *dummy __unused)
1217{
1218
1219	if (mp_ncpus == 1)
1220		return;
1221	atomic_store_rel_int(&aps_ready, 1);
1222	while (smp_started == 0)
1223		ia32_pause();
1224}
1225SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1226SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
1227
1228