/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <x86/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP TLB shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;
static volatile cpuset_t ipi_nmi_pending;
/* Used to hold the APs until we are ready to release them. */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;
static int cpu_cores;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpuset_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}
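
/*
 * Example (illustrative): a package reporting cpu_cores = 2 and
 * cpu_logical = 2 takes the final branch above and yields
 *
 *	smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L1, 2, CG_FLAG_HTT);
 *
 * i.e. the cores of a package share nothing, while the two hyperthreads
 * within each core share the L1 cache and are flagged with CG_FLAG_HTT.
 */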

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Setup
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}
/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/* Setup the initial logical CPUs info. */
	logical_cpus = 0;
	CPU_ZERO(&logical_cpus_mask);
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
	vm_offset_t start = (vm_offset_t)a;
	vm_offset_t end = (vm_offset_t)b;

	while (start < end) {
		xen_invlpg(start);
		start += PAGE_SIZE;
	}
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * These start from "IPI offset" APIC_IPI_INTS
 */
static call_data_func_t *ipi_vectors[6] =
{
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
};
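
/*
 * The table above is indexed by "func_id - APIC_IPI_INTS", so
 * IPI_RENDEZVOUS dispatches to iv_rendezvous(), IPI_INVLTLB to
 * iv_invltlb(), and so on; smp_call_function_interrupt() below performs
 * that translation after validating the function id.
 */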

/*
 * Reschedule callback.  Nothing to do; all the work is done automatically
 * when we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	return (FILTER_HANDLED);
}

struct _call_data {
	uint16_t func_id;
	uint16_t wait;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
};

static struct _call_data *call_data;
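
/*
 * call_data implements a single-slot handshake: the initiator (see
 * smp_tlb_shootdown() below) points call_data at a stack-allocated
 * _call_data under smp_ipi_mtx, fills in func_id/arg1/arg2, and sends
 * CALL_FUNCTION_VECTOR.  Each target bumps "started" before invoking
 * the function and, when "wait" is set, "finished" afterwards, which
 * tells the initiator when the arguments may safely go out of scope.
 */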

static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	/* We only handle function IPIs, not bitmap IPIs */
	if (call_data->func_id < APIC_IPI_INTS ||
	    call_data->func_id > IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	unsigned int irq;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
				    cpu,
				    resched_name[cpu],
				    smp_reschedule_interrupt,
				    INTR_TYPE_TTY, &irq);

	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
	    cpu, irq, RESCHEDULE_VECTOR);

	per_cpu(resched_irq, cpu) = irq;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
				    cpu,
				    callfunc_name[cpu],
				    smp_call_function_interrupt,
				    INTR_TYPE_TTY, &irq);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = irq;

	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
	    cpu, irq, CALL_FUNCTION_VECTOR);

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return 0;

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
	return rc;
}

static void
xen_smp_intr_init_cpus(void *unused)
{
	int i;

	for (i = 0; i < mp_ncpus; i++)
		xen_smp_intr_init(i);
}
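
/*
 * The per-CPU IPI event channels above are bound for every CPU at
 * SI_SUB_INTR time via the SYSINIT(start_ipis, ...) registration at
 * the bottom of this file.
 */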

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	u_int	cpuid;
	int	gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * Signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	cpuid = PCPU_GET(cpuid);
	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	printf("SMP: AP CPU #%d Launched!\n", cpuid);

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		CPU_SET(cpuid, &logical_cpus_mask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		CPU_SET(cpuid, &hyperthreading_cpus_mask);
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* Enable IPIs, TLB shootdown, freezes, etc. */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
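
/*
 * Note that, unlike native i386, no real-mode trampoline is involved
 * here: cpu_initialize_context() below points the new VCPU's eip
 * directly at init_secondary() and starts it with a pair of VCPUOP
 * hypercalls, so each AP enters the kernel already in protected mode.
 */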

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
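
/*
 * Example (illustrative, assuming the standard hints(5) syntax): a CPU
 * can be excluded from the ID assignment above by disabling its local
 * APIC from the loader, e.g.
 *
 *	hint.lapic.2.disabled="1"
 *
 * which makes resource_disabled("lapic", 2) return true.
 */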

/*
 * start each AP in our list
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet
		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
	}

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return mp_naps;
}
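
/*
 * Each AP gets its own page-sized GDT ("gdt + (512*cpu)" above, 512
 * eight-byte descriptors per page).  The page is remapped writable
 * while it is zeroed and filled, then mapped read-only again, since
 * Xen requires that GDT frames handed to the hypervisor not remain
 * writable by the guest.
 */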

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t = trap_table;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
static void
cpu_initialize_context(unsigned int cpu)
{
	/*
	 * vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock.
	 */
	vm_page_t m[NPGPTD + 2];	/* m[NPGPTD + 1] is indexed below */
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Pages [0-3]	PTD
	 * Page  [4]	boot stack
	 * Page  [5]	PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD - copying only
	 * kernel mappings
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * map actual idle stack to boot_stack
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1]));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] =
	    (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]);
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip  = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}
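
/*
 * The actual bring-up reduces to two hypercalls on the context built
 * above:
 *
 *	HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt);
 *	HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 *
 * VCPUOP_initialise loads the register and paging state and VCPUOP_up
 * marks the VCPU runnable; there is no INIT/SIPI sequence under Xen.
 */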

/*
 * This function starts the AP (application processor) identified
 * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
 * to accomplish this.  This is necessary because of the nuances
 * of the different hardware we might encounter.  It isn't pretty,
 * but it seems to work.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/* used as a watchpoint to signal AP startup */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return 1;	/* return SUCCESS */
		DELAY(1000);
	}
	return 0;		/* return FAILURE */
}

/*
 * send an IPI to a specific CPU.
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (!old_pending)
			ipi_pcpu(cpu, RESCHEDULE_VECTOR);
	} else {
		KASSERT(call_data != NULL, ("call_data not set"));
		ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
	}
}
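
/*
 * Bitmapped IPIs (IPI_PREEMPT, IPI_AST) are coalesced into
 * cpu_ipi_pending[cpu] and delivered through the single
 * RESCHEDULE_VECTOR event channel; smp_reschedule_interrupt() decodes
 * the bitmap on the target CPU.  Function IPIs instead rely on the
 * global call_data slot having been set up by the caller, hence the
 * KASSERT above.
 */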

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	ncpu = mp_ncpus - 1;	/* does not shootdown self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int cpu, ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (CPU_ISFULLSET(&mask)) {
		if (othercpus < 1)
			return;
	} else {
		CPU_CLR(PCPU_GET(cpuid), &mask);
		if (CPU_EMPTY(&mask))
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (CPU_ISFULLSET(&mask)) {
		ncpu = othercpus;
		ipi_all_but_self(vector);
	} else {
		ncpu = 0;
		while ((cpu = CPU_FFS(&mask)) != 0) {
			cpu--;
			CPU_CLR(cpu, &mask);
			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu,
			    vector);
			ipi_send_cpu(cpu, vector);
			ncpu++;
		}
	}
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}
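
/*
 * Usage sketch (illustrative): pmap code invalidates a single page on
 * a subset of CPUs with, e.g.,
 *
 *	smp_masked_invlpg(mask, va);
 *
 * which funnels into smp_targeted_tlb_shootdown(mask, IPI_INVLPG, va, 0)
 * above: the initiator publishes its arguments through call_data, IPIs
 * each CPU in the mask, and spins until smp_tlb_wait reaches the number
 * of CPUs signalled.
 */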

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(cpuset_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}
/*
 * send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(other_cpus, ipi);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending);
	cpustop_handler();
	return (0);
}
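
/*
 * The NMI trap handler is expected to call ipi_nmi_handler() above; a
 * zero return means the NMI was an IPI_STOP_HARD and the CPU has been
 * parked in cpustop_handler() until it was flagged in started_cpus
 * again.
 */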

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);