/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 184112 2008-10-21 06:39:40Z kmacy $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <machine/xen/evtchn.h>
#include <machine/xen/xen_intr.h>
#include <machine/xen/hypervisor.h>
#include <xen/interface/vcpu.h>

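/*
 * NMI-based CPU stop is not used under Xen; with stop_cpus_with_nmi fixed
 * at 0, IPI_STOP is always delivered through the normal event channel
 * based IPI path below.
 */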
#define stop_cpus_with_nmi	0

int	mp_naps;		/* # of Application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

/* Hotwire a 0->4MB V==P mapping */
extern pt_entry_t *KPTphys;

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually set up
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
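
/*
 * Determine the CPU topology handed to the scheduler from the detected
 * cpu_cores and cpu_logical counts.
 */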
struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Set up the initial logical CPUs info. */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}

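/*
 * Per-vector IPI handlers.  In this revision they are still empty stubs;
 * the ipi_vectors[] table below maps an IPI vector number to the function
 * that smp_call_function_interrupt() runs on the target CPU.
 */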
static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{

}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{

}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{

}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{

}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{

}

static void
iv_bitmap_vector(uintptr_t a, uintptr_t b)
{

}

static call_data_func_t *ipi_vectors[IPI_BITMAP_VECTOR + 1] = {
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
	iv_bitmap_vector
};

/*
 * Reschedule callback.  Nothing to do, all the work is done automatically
 * when we return from the interrupt.
 */
static void
smp_reschedule_interrupt(void *unused)
{
}

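/*
 * Argument block for a cross-CPU function call.  The initiator fills it in
 * and sends CALL_FUNCTION_VECTOR; each target bumps 'started' once it has
 * copied the arguments and 'finished' after the function has run (only
 * needed when 'wait' is set).
 */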
struct _call_data {
	call_data_func_t *func;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
	int wait;
};

static struct _call_data *call_data;

static void
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func = call_data->func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;

	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(&call_data->finished);
	}
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

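/*
 * Bind this CPU's reschedule and call-function IPIs to Xen event channel
 * handlers and, on APs, start the per-CPU clock.  Returns 0 on success;
 * on failure any binding already made is torn down and the error returned.
 */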
static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
				    cpu,
				    resched_name[cpu],
				    smp_reschedule_interrupt,
				    INTR_FAST);

	per_cpu(resched_irq, cpu) = rc;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
				    cpu,
				    callfunc_name[cpu],
				    smp_call_function_interrupt,
				    INTR_FAST);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = rc;

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return 0;

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
	return rc;
}

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * Signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit(__INITIAL_NPXCW__);
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, TLB shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	xen_smp_intr_init(bootAP);
	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the APs are up */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

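		/*
		 * Build this AP's GDT in place: map the page read/write,
		 * fill in the descriptors, then map it read-only again
		 * since Xen requires GDT frames not to be writable.
		 */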
		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet

		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return mp_naps;
}

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

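/*
 * Populate a new vcpu's trap table from the static trap_table[] shared
 * with the BSP.
 */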
static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t = trap_table;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
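
/*
 * Build the initial vcpu_guest_context for an AP: allocate page directory
 * pages, a PDPT and a boot stack, clone the kernel mappings from cpu0's
 * IdlePTD, pin the new page tables, and hand the context to the hypervisor
 * via VCPUOP_initialise/VCPUOP_up.
 */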
static void
cpu_initialize_context(unsigned int cpu)
{
	/* vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock */
	vm_page_t m[NPGPTD + 2];
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Pages [0-3]: PTD
	 * Page  [4]:  boot stack
	 * Page  [5]:  PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		ma[i] =
		    xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
	 * kernel mappings
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents      = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip  = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * Start the AP (application processor) identified by the given CPU number
 * (under Xen this is the VCPU id rather than a physical APIC ID).  Build
 * the vcpu's initial context, ask the hypervisor to bring the vcpu up and
 * wait for the AP to bump mp_naps to announce itself.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs.
 */
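/*
 * Note that in this revision nothing ever increments smp_tlb_wait (the
 * iv_* handlers above are still stubs), so these shootdown paths appear
 * to be incomplete placeholders for the Xen port.
 */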
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	call_data->func = ipi_vectors[vector];
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	int ncpu, othercpus;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	smp_tlb_addr1 = addr1;
	smp_tlb_addr2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(u_int mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(u_int mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

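/*
 * Handler for bitmapped IPIs: atomically collect the bits pending for this
 * CPU and act on them.  Only IPI_PREEMPT is handled here.
 */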
void
ipi_bitmap_handler(struct trapframe frame)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
		sched_preempt(curthread);
	}
}

/*
 * Send an IPI to a set of CPUs.
 */
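/*
 * Bitmapped IPIs are coalesced: the bit is ORed into cpu_ipi_pending[] and
 * an IPI_BITMAP_VECTOR is only sent if no bits were already pending for
 * that CPU.
 */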
void
ipi_selected(u_int32_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

#ifdef STOP_NMI
	if (ipi == IPI_STOP && stop_cpus_with_nmi) {
		ipi_nmi_selected(cpus);
		return;
	}
#endif
	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
			    old_pending, new_pending));

			if (old_pending)
				continue;
		}

		ipi_pcpu(cpu, ipi);
	}
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{

	if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) {
		ipi_selected(PCPU_GET(other_cpus), ipi);
		return;
	}
	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(((int)-1 & ~(1 << curcpu)), ipi);
}

#ifdef STOP_NMI
/*
 * send NMI IPI to selected CPUs
 */

#define	BEFORE_SPIN	1000000

void
ipi_nmi_selected(u_int32_t cpus)
{
	int cpu;
	register_t icrlo;

	icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT
		| APIC_TRIGMOD_EDGE;

	CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus);

	atomic_set_int(&ipi_nmi_pending, cpus);

	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI NMI to non-existent CPU %d", cpu));

		/* Wait for an earlier IPI to finish. */
		if (!lapic_ipi_wait(BEFORE_SPIN))
			panic("ipi_nmi_selected: previous IPI has not cleared");

		lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]);
	}
}

int
ipi_nmi_handler(void)
{
	int cpumask = PCPU_GET(cpumask);

	if (!(ipi_nmi_pending & cpumask))
		return 1;

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return 0;
}

#endif /* STOP_NMI */

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);