/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2008, by Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/i386/xen/mp_machdep.c 204972 2010-03-10 19:50:52Z jhb $");

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_mp_watchdog.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#if !defined(lint)
#if !defined(SMP)
#error How did you get here?
#endif

#ifndef DEV_APIC
#error The apic device is required for SMP, add "device apic" to your config file.
#endif
#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT)
#error SMP not supported with CPU_DISABLE_CMPXCHG
#endif
#endif /* not lint */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#include <machine/apicreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/pcpu.h>

#include <machine/xen/xen-os.h>
#include <xen/evtchn.h>
#include <xen/xen_intr.h>
#include <xen/hypervisor.h>
#include <xen/interface/vcpu.h>

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

static int bootAP;
static union descriptor *bootAPgdt;

static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

/* Free these after use */
void *bootstacks[MAXCPU];

struct pcb stoppcbs[MAXCPU];

/* Variables needed for SMP tlb shootdown. */
vm_offset_t smp_tlb_addr1;
vm_offset_t smp_tlb_addr2;
volatile int smp_tlb_wait;

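/*
 * Signature shared by the iv_* IPI handlers below; arg1/arg2 carry the
 * request payload from struct _call_data, dispatched to a handler by
 * smp_call_function_interrupt().
 */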
typedef void call_data_func_t(uintptr_t, uintptr_t);

static u_int logical_cpus;
static volatile cpumask_t ipi_nmi_pending;

/* used to hold the APs until we are ready to release them */
static struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
static volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually set up
 * the APs.
 */
struct cpu_info {
	int	cpu_present:1;
	int	cpu_bsp:1;
	int	cpu_disabled:1;
} static cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static int cpu_logical;
static int cpu_cores;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t	hyperthreading_cpus_mask;

extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);
extern void pmap_lazyfix_action(void);

struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT, no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core, no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}
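
/*
 * Example: with cpu_cores = 2 and cpu_logical = 2 the final case fires,
 * building a two-level topology in which each hyperthread pair shares an
 * L1 cache (CG_FLAG_HTT) while the cores themselves share nothing.
 */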

/*
 * Calculate usable address in base memory for AP trampoline code.
 */
u_int
mp_bootaddress(u_int basemem)
{

	return (basemem);
}

void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU)
		mp_ncpus++;
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	mp_maxid = MAXCPU - 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	all_cpus = 1;
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Set up
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Initialize the IPI handlers and start up the APs.
 */
void
cpu_mp_start(void)
{
	int i;

	/* Initialize the logical ID to APIC ID table. */
	for (i = 0; i < MAXCPU; i++) {
		cpu_apic_ids[i] = -1;
		cpu_ipi_pending[i] = 0;
	}

	/* Set boot_cpu_id if needed. */
	if (boot_cpu_id == -1) {
		boot_cpu_id = PCPU_GET(apic_id);
		cpu_info[boot_cpu_id].cpu_bsp = 1;
	} else
		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
		    ("BSP's APIC ID doesn't match boot_cpu_id"));
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;

	assign_cpu_ids();

	/* Start each Application Processor */
	start_all_aps();

	/*
	 * Set up the initial logical CPUs info.  CPUID_HTT_CORES masks the
	 * logical-processors-per-package count reported in bits 23:16 of
	 * EBX by CPUID leaf 1.
	 */
	logical_cpus = logical_cpus_mask = 0;
	if (cpu_feature & CPUID_HTT)
		logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16;

	set_interrupt_apic_ids();
}

static void
iv_rendezvous(uintptr_t a, uintptr_t b)
{
	smp_rendezvous_action();
}

static void
iv_invltlb(uintptr_t a, uintptr_t b)
{
	xen_tlb_flush();
}

static void
iv_invlpg(uintptr_t a, uintptr_t b)
{
	xen_invlpg(a);
}

static void
iv_invlrng(uintptr_t a, uintptr_t b)
{
	vm_offset_t start = (vm_offset_t)a;
	vm_offset_t end = (vm_offset_t)b;

	while (start < end) {
		xen_invlpg(start);
		start += PAGE_SIZE;
	}
}

static void
iv_invlcache(uintptr_t a, uintptr_t b)
{

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

static void
iv_lazypmap(uintptr_t a, uintptr_t b)
{
	pmap_lazyfix_action();
	atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * Call-function IPI handlers, indexed by (func_id - APIC_IPI_INTS);
 * the IPI vector numbers are allocated consecutively starting at
 * APIC_IPI_INTS.
 */
static call_data_func_t *ipi_vectors[6] = {
	iv_rendezvous,
	iv_invltlb,
	iv_invlpg,
	iv_invlrng,
	iv_invlcache,
	iv_lazypmap,
};
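
/*
 * For example, a request carrying func_id == IPI_INVLPG (defined in
 * apicvar.h as APIC_IPI_INTS + 2) selects iv_invlpg above.
 */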

/*
 * Reschedule callback.  Nothing to do, all the work is done
 * automatically when we return from the interrupt.
 */
static int
smp_reschedule_interrupt(void *unused)
{
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);

	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(curthread);
	}

	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	return (FILTER_HANDLED);
}

struct _call_data {
	uint16_t func_id;
	uint16_t wait;
	uintptr_t arg1;
	uintptr_t arg2;
	atomic_t started;
	atomic_t finished;
};

static struct _call_data *call_data;
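
/*
 * Protocol sketch: the initiator (see smp_tlb_shootdown() below) takes
 * smp_ipi_mtx, points call_data at a stack-local request, and raises
 * CALL_FUNCTION_VECTOR.  Each target bumps 'started' before running the
 * handler and 'finished' after it when 'wait' is set; the initiators in
 * this file actually wait on smp_tlb_wait, which every target also
 * increments.
 */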

static int
smp_call_function_interrupt(void *unused)
{
	call_data_func_t *func;
	uintptr_t arg1 = call_data->arg1;
	uintptr_t arg2 = call_data->arg2;
	int wait = call_data->wait;
	atomic_t *started = &call_data->started;
	atomic_t *finished = &call_data->finished;

	/* We only handle function IPIs, not bitmap IPIs */
	if (call_data->func_id < APIC_IPI_INTS ||
	    call_data->func_id > IPI_BITMAP_VECTOR)
		panic("invalid function id %u", call_data->func_id);

	func = ipi_vectors[call_data->func_id - APIC_IPI_INTS];
	/*
	 * Notify the initiating CPU that I've grabbed the data and am
	 * about to execute the function.
	 */
	mb();
	atomic_inc(started);
	/*
	 * At this point the info structure may be out of scope unless wait==1.
	 */
	(*func)(arg1, arg2);

	if (wait) {
		mb();
		atomic_inc(finished);
	}
	atomic_add_int(&smp_tlb_wait, 1);
	return (FILTER_HANDLED);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf("  cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

static int
xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	unsigned int irq;

	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	sprintf(resched_name[cpu], "resched%u", cpu);
	rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
				    cpu,
				    resched_name[cpu],
				    smp_reschedule_interrupt,
	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
	if (rc < 0)
		goto fail;

	printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n",
	    cpu, irq, RESCHEDULE_VECTOR);

	per_cpu(resched_irq, cpu) = irq;

	sprintf(callfunc_name[cpu], "callfunc%u", cpu);
	rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
				    cpu,
				    callfunc_name[cpu],
				    smp_call_function_interrupt,
	    INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = irq;

	printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n",
	    cpu, irq, CALL_FUNCTION_VECTOR);

	if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0))
		goto fail;

	return (0);

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu));
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu));
	return (rc);
}

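/*
 * Bind the IPI handlers on every CPU once interrupts are available;
 * the start_ipis SYSINIT at the bottom of this file runs this at
 * SI_SUB_INTR time.
 */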
static void
xen_smp_intr_init_cpus(void *unused)
{
	int i;

	for (i = 0; i < mp_ncpus; i++)
		xen_smp_intr_init(i);
}

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;

	/* bootAP is set in start_ap() to our ID. */
	PCPU_SET(currentldt, _default_ldt);
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * Signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the APs. */
	while (!aps_ready)
		ia32_pause();

	/* The BSP may have changed the PTD while we were waiting. */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit();
#if 0
	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);
#if 0
	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * Start each AP in our list.
 */
/* Lowest 1MB is already mapped: don't touch */
#define TMPMAP_START 1
int
start_all_aps(void)
{
	int x, apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		bootAP = cpu;
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP);
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet
		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
			/* better panic as the AP may be running loose */
			printf("panic y/n? [y] ");
			if (cngetc() != 'n')
				panic("bye-bye");
		}

		all_cpus |= (1 << cpu);		/* record AP in CPU map */
	}

	/* build our map of 'other' CPUs */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));

	pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1);

	/* number of APs actually started */
	return (mp_naps);
}

extern uint8_t *pcpu_boot_stack;
extern trap_info_t trap_table[];

static void
smp_trap_init(trap_info_t *trap_ctxt)
{
	const trap_info_t *t = trap_table;

	for (t = trap_table; t->address; t++) {
		trap_ctxt[t->vector].flags = t->flags;
		trap_ctxt[t->vector].cs = t->cs;
		trap_ctxt[t->vector].address = t->address;
	}
}

extern int nkpt;
static void
cpu_initialize_context(unsigned int cpu)
{
	/*
	 * vcpu_guest_context_t is too large to allocate on the stack.
	 * Hence we allocate statically and protect it with a lock.
	 */
	vm_page_t m[4];
	static vcpu_guest_context_t ctxt;
	vm_offset_t boot_stack;
	vm_offset_t newPTD;
	vm_paddr_t ma[NPGPTD];
	static int color;
	int i;

	/*
	 * Page 0,[0-3]	PTD
	 * Page 1, [4]	boot stack
	 * Page [5]	PDPT
	 */
	for (i = 0; i < NPGPTD + 2; i++) {
		m[i] = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);

		pmap_zero_page(m[i]);
	}
	boot_stack = kmem_alloc_nofault(kernel_map, 1);
	newPTD = kmem_alloc_nofault(kernel_map, NPGPTD);
	ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V;

#ifdef PAE
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
	for (i = 0; i < NPGPTD; i++) {
		((vm_paddr_t *)boot_stack)[i] =
		ma[i] =
		    xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V;
	}
#endif

	/*
	 * Copy cpu0 IdlePTD to new IdlePTD, copying only
	 * kernel mappings.
	 */
	pmap_qenter(newPTD, m, 4);

	memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t),
	    (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t),
	    nkpt*sizeof(vm_paddr_t));

	pmap_qremove(newPTD, 4);
	kmem_free(kernel_map, newPTD, 4);
	/*
	 * Map the actual idle stack at boot_stack.
	 */
	pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD]));

	xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])));
	vm_page_lock_queues();
	for (i = 0; i < 4; i++) {
		int pdir = (PTDPTDI + i) / NPDEPG;
		int curoffset = (PTDPTDI + i) % NPDEPG;

		xen_queue_pt_update((vm_paddr_t)
		    ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))),
		    ma[i]);
	}
	PT_UPDATES_FLUSH();
	vm_page_unlock_queues();

	memset(&ctxt, 0, sizeof(ctxt));
	ctxt.flags = VGCF_IN_KERNEL;
	ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL);
	ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.user_regs.eip = (unsigned long)init_secondary;
	ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */

	memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

	smp_trap_init(ctxt.trap_ctxt);

	ctxt.ldt_ents = 0;
	ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT);
	ctxt.gdt_ents      = 512;

#ifdef __i386__
	ctxt.user_regs.esp = boot_stack + PAGE_SIZE;

	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = boot_stack + PAGE_SIZE;

	ctxt.event_callback_cs     = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.event_callback_eip    = (unsigned long)Xhypervisor_callback;
	ctxt.failsafe_callback_cs  = GSEL(GCODE_SEL, SEL_KPL);
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

	ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]));
#else /* __x86_64__ */
	ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
	ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
	ctxt.kernel_sp = idle->thread.rsp0;

	ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
	ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
	ctxt.syscall_callback_eip  = (unsigned long)system_call;

	ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

	ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

	printf("gdtpfn=%lx pdptpfn=%lx\n",
	    ctxt.gdt_frames[0],
	    ctxt.ctrlreg[3] >> PAGE_SHIFT);

	/*
	 * Load the assembled state into the new vcpu and make it
	 * runnable; it will enter init_secondary() via user_regs.eip.
	 */
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
	DELAY(3000);
	PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL));
}

/*
 * This function starts the AP (application processor) identified by the
 * APIC ID 'physicalCpu'.  It does quite a "song and dance" to accomplish
 * this.  This is necessary because of the nuances of the different
 * hardware we might encounter.  It isn't pretty, but it seems to work.
 */

int cpus;
static int
start_ap(int apic_id)
{
	int ms;

	/*
	 * Snapshot mp_naps; init_secondary() increments it once the AP
	 * is alive, which is how we detect startup below.
	 */
	cpus = mp_naps;

	cpu_initialize_context(apic_id);

	/* Wait up to 5 seconds for it to start. */
	for (ms = 0; ms < 5000; ms++) {
		if (mp_naps > cpus)
			return (1);	/* return SUCCESS */
		DELAY(1000);
	}
	return (0);		/* return FAILURE */
}

/*
 * Flush the TLB on all other CPUs.
 */
static void
smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
	u_int ncpu;
	struct _call_data data;

	ncpu = mp_ncpus - 1;	/* does not shoot down self */
	if (ncpu < 1)
		return;		/* no other cpus */
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	ipi_all_but_self(vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}
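
/*
 * The PSL_I check above guards against a likely deadlock: if another
 * CPU already holds smp_ipi_mtx and is waiting for us to acknowledge
 * its IPI, entering here with interrupts disabled would spin on the
 * mutex forever without ever servicing that IPI.  Typical caller (see
 * smp_invlpg() below):
 *
 *	smp_tlb_shootdown(IPI_INVLPG, addr, 0);
 */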

static void
smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1,
    vm_offset_t addr2)
{
	int ncpu, othercpus;
	struct _call_data data;

	othercpus = mp_ncpus - 1;
	if (mask == (u_int)-1) {
		ncpu = othercpus;
		if (ncpu < 1)
			return;
	} else {
		mask &= ~PCPU_GET(cpumask);
		if (mask == 0)
			return;
		ncpu = bitcount32(mask);
		if (ncpu > othercpus) {
			/* XXX this should be a panic offence */
			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
			    ncpu, othercpus);
			ncpu = othercpus;
		}
		/* XXX should be a panic, implied by mask == 0 above */
		if (ncpu < 1)
			return;
	}
	if (!(read_eflags() & PSL_I))
		panic("%s: interrupts disabled", __func__);
	mtx_lock_spin(&smp_ipi_mtx);
	KASSERT(call_data == NULL, ("call_data isn't null?!"));
	call_data = &data;
	call_data->func_id = vector;
	call_data->arg1 = addr1;
	call_data->arg2 = addr2;
	atomic_store_rel_int(&smp_tlb_wait, 0);
	if (mask == (u_int)-1)
		ipi_all_but_self(vector);
	else
		ipi_selected(mask, vector);
	while (smp_tlb_wait < ncpu)
		ia32_pause();
	call_data = NULL;
	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_cache_flush(void)
{

	if (smp_started)
		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
}

void
smp_invltlb(void)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
	}
}

void
smp_invlpg(vm_offset_t addr)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
	}
}

void
smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
	}
}

void
smp_masked_invltlb(cpumask_t mask)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
	}
}

void
smp_masked_invlpg(cpumask_t mask, vm_offset_t addr)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
	}
}

void
smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2)
{

	if (smp_started) {
		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
	}
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpumask_t cpus, u_int ipi)
{
	int cpu;
	u_int bitmap = 0;
	u_int old_pending;
	u_int new_pending;

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, cpus);

	CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi);
	while ((cpu = ffs(cpus)) != 0) {
		cpu--;
		cpus &= ~(1 << cpu);

		KASSERT(cpu_apic_ids[cpu] != -1,
		    ("IPI to non-existent CPU %d", cpu));

		if (bitmap) {
			do {
				old_pending = cpu_ipi_pending[cpu];
				new_pending = old_pending | bitmap;
			} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
			    old_pending, new_pending));
			if (!old_pending)
				ipi_pcpu(cpu, RESCHEDULE_VECTOR);
		} else {
			KASSERT(call_data != NULL, ("call_data not set"));
			ipi_pcpu(cpu, CALL_FUNCTION_VECTOR);
		}
	}
}
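
/*
 * Bitmap IPIs coalesce: ipi_selected(1 << cpu, IPI_AST), for instance,
 * just ORs the IPI_AST bit into cpu_ipi_pending[cpu] and only raises
 * RESCHEDULE_VECTOR if no bits were already pending, so a CPU that has
 * not yet serviced the previous wakeup is not interrupted again.
 */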

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what the source is.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus));

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	ipi_selected(PCPU_GET(other_cpus), ipi);
}

int
ipi_nmi_handler(void)
{
	cpumask_t cpumask;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpumask = PCPU_GET(cpumask);
	if ((ipi_nmi_pending & cpumask) == 0)
		return (1);

	atomic_clear_int(&ipi_nmi_pending, cpumask);
	cpustop_handler();
	return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	int cpu = PCPU_GET(cpuid);
	int cpumask = PCPU_GET(cpumask);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	atomic_set_int(&stopped_cpus, cpumask);

	/* Wait for restart */
	while (!(started_cpus & cpumask))
		ia32_pause();

	atomic_clear_int(&started_cpus, cpumask);
	atomic_clear_int(&stopped_cpus, cpumask);

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL);
1228