1/*
2 *
3 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
4 *
5 * This file contains Original Code and/or Modifications of Original Code
6 * as defined in and that are subject to the Apple Public Source License
7 * Version 2.0 (the 'License'). You may not use this file except in
8 * compliance with the License. The rights granted to you under the License
9 * may not be used to create, or enable the creation or redistribution of,
10 * unlawful or unlicensed copies of an Apple operating system, or to
11 * circumvent, violate, or enable the circumvention or violation of, any
12 * terms of an Apple operating system software license agreement.
13 *
14 * Please obtain a copy of the License at
15 * http://www.opensource.apple.com/apsl/ and read it before using this file.
16 *
17 * The Original Code and all software distributed under the License are
18 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
19 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
20 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
22 * Please see the License for the specific language governing rights and
23 * limitations under the License.
24 *
25 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
26 */
27/*
28 * @OSF_COPYRIGHT@
29 */
30
31#include <mach_rt.h>
32#include <mach_kdp.h>
33#include <mach_ldebug.h>
34#include <gprof.h>
35
36#include <mach/mach_types.h>
37#include <mach/kern_return.h>
38
39#include <kern/kern_types.h>
40#include <kern/startup.h>
41#include <kern/timer_queue.h>
42#include <kern/processor.h>
43#include <kern/cpu_number.h>
44#include <kern/cpu_data.h>
45#include <kern/assert.h>
46#include <kern/machine.h>
47#include <kern/pms.h>
48#include <kern/misc_protos.h>
49#include <kern/etimer.h>
50#include <kern/kalloc.h>
51#include <kern/queue.h>
52
53#include <vm/vm_map.h>
54#include <vm/vm_kern.h>
55
56#include <profiling/profile-mk.h>
57
58#include <i386/proc_reg.h>
59#include <i386/cpu_threads.h>
60#include <i386/mp_desc.h>
61#include <i386/misc_protos.h>
62#include <i386/trap.h>
63#include <i386/postcode.h>
64#include <i386/machine_routines.h>
65#include <i386/mp.h>
66#include <i386/mp_events.h>
67#include <i386/lapic.h>
68#include <i386/cpuid.h>
69#include <i386/fpu.h>
70#include <i386/machine_cpu.h>
71#include <i386/pmCPU.h>
72#if CONFIG_MCA
73#include <i386/machine_check.h>
74#endif
75#include <i386/acpi.h>
76
77#include <chud/chud_xnu.h>
78#include <chud/chud_xnu_private.h>
79
80#include <sys/kdebug.h>
81
82#if	MP_DEBUG
83#define PAUSE		delay(1000000)
84#define DBG(x...)	kprintf(x)
85#else
86#define DBG(x...)
87#define PAUSE
88#endif	/* MP_DEBUG */
89
90/* Debugging/test trace events: */
91#define	TRACE_MP_TLB_FLUSH		MACHDBG_CODE(DBG_MACH_MP, 0)
92#define	TRACE_MP_CPUS_CALL		MACHDBG_CODE(DBG_MACH_MP, 1)
93#define	TRACE_MP_CPUS_CALL_LOCAL	MACHDBG_CODE(DBG_MACH_MP, 2)
94#define	TRACE_MP_CPUS_CALL_ACTION	MACHDBG_CODE(DBG_MACH_MP, 3)
95#define	TRACE_MP_CPUS_CALL_NOBUF	MACHDBG_CODE(DBG_MACH_MP, 4)
96#define	TRACE_MP_CPU_FAST_START		MACHDBG_CODE(DBG_MACH_MP, 5)
97#define	TRACE_MP_CPU_START		MACHDBG_CODE(DBG_MACH_MP, 6)
98#define	TRACE_MP_CPU_DEACTIVATE		MACHDBG_CODE(DBG_MACH_MP, 7)
99
100#define ABS(v)		(((v) > 0)?(v):-(v))
101
102void 		slave_boot_init(void);
103void		i386_cpu_IPI(int cpu);
104
105static void	mp_kdp_wait(boolean_t flush, boolean_t isNMI);
106static void	mp_rendezvous_action(void);
107static void 	mp_broadcast_action(void);
108
109static boolean_t	cpu_signal_pending(int cpu, mp_event_t event);
110static int		NMIInterruptHandler(x86_saved_state_t *regs);
111
112boolean_t 		smp_initialized = FALSE;
113uint32_t 		TSC_sync_margin = 0xFFF;
114volatile boolean_t	force_immediate_debugger_NMI = FALSE;
115volatile boolean_t	pmap_tlb_flush_timeout = FALSE;
116decl_simple_lock_data(,mp_kdp_lock);
117
118decl_lck_mtx_data(static, mp_cpu_boot_lock);
119lck_mtx_ext_t	mp_cpu_boot_lock_ext;
120
121/* Variables needed for MP rendezvous. */
122decl_simple_lock_data(,mp_rv_lock);
123static void	(*mp_rv_setup_func)(void *arg);
124static void	(*mp_rv_action_func)(void *arg);
125static void	(*mp_rv_teardown_func)(void *arg);
126static void	*mp_rv_func_arg;
127static volatile int	mp_rv_ncpus;
128			/* Cache-aligned barriers: */
129static volatile long	mp_rv_entry    __attribute__((aligned(64)));
130static volatile long	mp_rv_exit     __attribute__((aligned(64)));
131static volatile long	mp_rv_complete __attribute__((aligned(64)));
132
133volatile	uint64_t	debugger_entry_time;
134volatile	uint64_t	debugger_exit_time;
135#if MACH_KDP
136#include <kdp/kdp.h>
137extern int kdp_snapshot;
138static struct _kdp_xcpu_call_func {
139	kdp_x86_xcpu_func_t func;
140	void     *arg0, *arg1;
141	volatile long     ret;
142	volatile uint16_t cpu;
143} kdp_xcpu_call_func = {
144	.cpu  = KDP_XCPU_NONE
145};
146
147#endif
148
149/* Variables needed for MP broadcast. */
150static void        (*mp_bc_action_func)(void *arg);
151static void        *mp_bc_func_arg;
152static int     	mp_bc_ncpus;
153static volatile long   mp_bc_count;
154decl_lck_mtx_data(static, mp_bc_lock);
155lck_mtx_ext_t	mp_bc_lock_ext;
156static	volatile int 	debugger_cpu = -1;
157volatile long NMIPI_acks = 0;
158
159static void	mp_cpus_call_init(void);
160static void	mp_cpus_call_cpu_init(void);
161static void	mp_cpus_call_action(void);
162static void	mp_call_PM(void);
163
164char		mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
165
166/* PAL-related routines */
167boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
168		int ipi_vector, i386_intr_func_t ipi_handler);
169void i386_start_cpu(int lapic_id, int cpu_num);
170void i386_send_NMI(int cpu);
171
172#if GPROF
173/*
174 * Initialize dummy structs for profiling. These aren't used, but they
175 * allow hertz_tick() to be built with GPROF defined.
176 */
177struct profile_vars _profile_vars;
178struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars };
179#define GPROF_INIT()							\
180{									\
181	int	i;							\
182									\
183	/* Hack to initialize pointers to unused profiling structs */	\
184	for (i = 1; i < MAX_CPUS; i++)				\
185		_profile_vars_cpus[i] = &_profile_vars;			\
186}
187#else
188#define GPROF_INIT()
189#endif /* GPROF */
190
191static lck_grp_t 	smp_lck_grp;
192static lck_grp_attr_t	smp_lck_grp_attr;
193
194#define NUM_CPU_WARM_CALLS	20
195struct timer_call	cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
196queue_head_t 		cpu_warm_call_list;
197decl_simple_lock_data(static, cpu_warm_lock);
198
199typedef struct cpu_warm_data {
200	timer_call_t 	cwd_call;
201	uint64_t	cwd_deadline;
202	int		cwd_result;
203} *cpu_warm_data_t;
204
205static void		cpu_prewarm_init(void);
206static void 		cpu_warm_timer_call_func(call_entry_param_t p0, call_entry_param_t p1);
207static void 		_cpu_warm_setup(void *arg);
208static timer_call_t 	grab_warm_timer_call(void);
209static void		free_warm_timer_call(timer_call_t call);
210
211void
212smp_init(void)
213{
214	simple_lock_init(&mp_kdp_lock, 0);
215	simple_lock_init(&mp_rv_lock, 0);
216	lck_grp_attr_setdefault(&smp_lck_grp_attr);
217	lck_grp_init(&smp_lck_grp, "i386_smp", &smp_lck_grp_attr);
218	lck_mtx_init_ext(&mp_cpu_boot_lock, &mp_cpu_boot_lock_ext, &smp_lck_grp, LCK_ATTR_NULL);
219	lck_mtx_init_ext(&mp_bc_lock, &mp_bc_lock_ext, &smp_lck_grp, LCK_ATTR_NULL);
220	console_init();
221
222	if(!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
223				LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler))
224		return;
225
226	cpu_thread_init();
227
228	GPROF_INIT();
229	DBGLOG_CPU_INIT(master_cpu);
230
231	mp_cpus_call_init();
232	mp_cpus_call_cpu_init();
233
234	if (PE_parse_boot_argn("TSC_sync_margin",
235					&TSC_sync_margin, sizeof(TSC_sync_margin))) {
236		kprintf("TSC sync margin 0x%x\n", TSC_sync_margin);
237	} else if (cpuid_vmm_present()) {
238		kprintf("TSC sync margin disabled\n");
239		TSC_sync_margin = 0;
240	}
241	smp_initialized = TRUE;
242
243	cpu_prewarm_init();
244
245	return;
246}
247
248typedef struct {
249	int			target_cpu;
250	int			target_lapic;
251	int			starter_cpu;
252} processor_start_info_t;
253static processor_start_info_t	start_info	  __attribute__((aligned(64)));
254
255/*
256 * The following are cache-aligned to avoid cross-cpu false-sharing interference.
257 */
258static volatile long		tsc_entry_barrier __attribute__((aligned(64)));
259static volatile long		tsc_exit_barrier  __attribute__((aligned(64)));
260static volatile uint64_t	tsc_target	  __attribute__((aligned(64)));
261
262/*
263 * Poll a CPU to see when it has marked itself as running.
264 */
265static void
266mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
267{
268	while (iters-- > 0) {
269		if (cpu_datap(slot_num)->cpu_running)
270			break;
271		delay(usecdelay);
272	}
273}
274
275/*
276 * Quickly bring a CPU back online which has been halted.
277 */
278kern_return_t
279intel_startCPU_fast(int slot_num)
280{
281	kern_return_t	rc;
282
283	/*
284	 * Try to perform a fast restart
285	 */
286	rc = pmCPUExitHalt(slot_num);
287	if (rc != KERN_SUCCESS)
288		/*
289		 * The CPU was not eligible for a fast restart.
290		 */
291		return(rc);
292
293	KERNEL_DEBUG_CONSTANT(
294		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
295		slot_num, 0, 0, 0, 0);
296
297	/*
298	 * Wait until the CPU is back online.
299	 */
300	mp_disable_preemption();
301
302	/*
303	 * We use short pauses (1us) for low latency.  30,000 iterations is
304	 * longer than a full restart would require, so it should be more
305	 * than long enough.
306	 */
307
308	mp_wait_for_cpu_up(slot_num, 30000, 1);
309	mp_enable_preemption();
310
311	KERNEL_DEBUG_CONSTANT(
312		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
313		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
314
315	/*
316	 * Check to make sure that the CPU is really running.  If not,
317	 * go through the slow path.
318	 */
319	if (cpu_datap(slot_num)->cpu_running)
320		return(KERN_SUCCESS);
321	else
322		return(KERN_FAILURE);
323}
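
/*
 * Hedged usage sketch (illustrative only, not part of this file's logic):
 * a caller re-starting a halted CPU would typically try the fast path
 * first and fall back to the full startup sequence below if the CPU was
 * not eligible, e.g.
 *
 *	if (intel_startCPU_fast(slot_num) != KERN_SUCCESS)
 *		(void) intel_startCPU(slot_num);
 *
 * The actual call sites live elsewhere (cpu start/power-management code);
 * this only shows how the two entry points relate.
 */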
324
325static void
326started_cpu(void)
327{
328	/* Here on the started cpu with cpu_running set TRUE */
329
330	if (TSC_sync_margin &&
331	    start_info.target_cpu == cpu_number()) {
332		/*
333		 * I've just started-up, synchronize again with the starter cpu
334		 * and then snap my TSC.
335		 */
336		tsc_target   = 0;
337		atomic_decl(&tsc_entry_barrier, 1);
338		while (tsc_entry_barrier != 0)
339			;	/* spin for starter and target at barrier */
340		tsc_target = rdtsc64();
341		atomic_decl(&tsc_exit_barrier, 1);
342	}
343}
344
345static void
346start_cpu(void *arg)
347{
348	int			i = 1000;
349	processor_start_info_t	*psip = (processor_start_info_t *) arg;
350
351	/* Ignore this if the current processor is not the starter */
352	if (cpu_number() != psip->starter_cpu)
353		return;
354
355	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
356		arg, psip->target_cpu, psip->target_lapic);
357
358	KERNEL_DEBUG_CONSTANT(
359		TRACE_MP_CPU_START | DBG_FUNC_START,
360		psip->target_cpu,
361		psip->target_lapic, 0, 0, 0);
362
363	i386_start_cpu(psip->target_lapic, psip->target_cpu);
364
365#ifdef	POSTCODE_DELAY
366	/* Wait much longer if postcodes are displayed for a delay period. */
367	i *= 10000;
368#endif
369	DBG("start_cpu(%p) about to wait for cpu %d\n",
370		arg, psip->target_cpu);
371
372	mp_wait_for_cpu_up(psip->target_cpu, i*100, 100);
373
374	KERNEL_DEBUG_CONSTANT(
375		TRACE_MP_CPU_START | DBG_FUNC_END,
376		psip->target_cpu,
377		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);
378
379	if (TSC_sync_margin &&
380	    cpu_datap(psip->target_cpu)->cpu_running) {
381		/*
382		 * Compare the TSC from the started processor with ours.
383		 * Report and log/panic if it diverges by more than
384		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
385		 * can be overridden by boot-arg (with 0 meaning no checking).
386		 */
387		uint64_t	tsc_starter;
388		int64_t		tsc_delta;
389		atomic_decl(&tsc_entry_barrier, 1);
390		while (tsc_entry_barrier != 0)
391			;	/* spin for both processors at barrier */
392		tsc_starter = rdtsc64();
393		atomic_decl(&tsc_exit_barrier, 1);
394		while (tsc_exit_barrier != 0)
395			;	/* spin for target to store its TSC */
396		tsc_delta = tsc_target - tsc_starter;
397		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
398			psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
399		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
400#if DEBUG
401			panic(
402#else
403			printf(
404#endif
405				"Unsynchronized TSC for cpu %d: "
406					"0x%016llx, delta 0x%llx\n",
407				psip->target_cpu, tsc_target, tsc_delta);
408		}
409	}
410}
411
412kern_return_t
413intel_startCPU(
414	int	slot_num)
415{
416	int		lapic = cpu_to_lapic[slot_num];
417	boolean_t	istate;
418
419	assert(lapic != -1);
420
421	DBGLOG_CPU_INIT(slot_num);
422
423	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
424	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);
425
426	/*
427	 * Initialize (or re-initialize) the descriptor tables for this cpu.
428	 * Propagate processor mode to slave.
429	 */
430	if (cpu_mode_is64bit())
431		cpu_desc_init64(cpu_datap(slot_num));
432	else
433		cpu_desc_init(cpu_datap(slot_num));
434
435	/* Serialize use of the slave boot stack, etc. */
436	lck_mtx_lock(&mp_cpu_boot_lock);
437
438	istate = ml_set_interrupts_enabled(FALSE);
439	if (slot_num == get_cpu_number()) {
440		ml_set_interrupts_enabled(istate);
441		lck_mtx_unlock(&mp_cpu_boot_lock);
442		return KERN_SUCCESS;
443	}
444
445	start_info.starter_cpu  = cpu_number();
446	start_info.target_cpu   = slot_num;
447	start_info.target_lapic = lapic;
448	tsc_entry_barrier = 2;
449	tsc_exit_barrier = 2;
450
451	/*
452	 * Perform the processor startup sequence with all running
453	 * processors rendezvous'ed. This is required during periods when
454	 * the cache-disable bit is set for MTRR/PAT initialization.
455	 */
456	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);
457
458	start_info.target_cpu = 0;
459
460	ml_set_interrupts_enabled(istate);
461	lck_mtx_unlock(&mp_cpu_boot_lock);
462
463	if (!cpu_datap(slot_num)->cpu_running) {
464		kprintf("Failed to start CPU %02d\n", slot_num);
465		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
466		delay(1000000);
467		halt_cpu();
468		return KERN_SUCCESS;
469	} else {
470		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
471		return KERN_SUCCESS;
472	}
473}
474
475#if	MP_DEBUG
476cpu_signal_event_log_t	*cpu_signal[MAX_CPUS];
477cpu_signal_event_log_t	*cpu_handle[MAX_CPUS];
478
479MP_EVENT_NAME_DECL();
480
481#endif	/* MP_DEBUG */
482
483int
484cpu_signal_handler(x86_saved_state_t *regs)
485{
486	int		my_cpu;
487	volatile int	*my_word;
488
489	SCHED_STATS_IPI(current_processor());
490
491	my_cpu = cpu_number();
492	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
493	/* Store the initial set of signals for diagnostics. New
494	 * signals could arrive while these are being processed
495	 * so it's no more than a hint.
496	 */
497
498	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;
499
500	do {
501#if	MACH_KDP
502		if (i_bit(MP_KDP, my_word)) {
503			DBGLOG(cpu_handle,my_cpu,MP_KDP);
504			i_bit_clear(MP_KDP, my_word);
505/* Ensure that the i386_kernel_state at the base of the
506 * current thread's stack (if any) is synchronized with the
507 * context at the moment of the interrupt, to facilitate
508 * access through the debugger.
509 */
510			sync_iss_to_iks(regs);
511			if (pmsafe_debug && !kdp_snapshot)
512				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
513			mp_kdp_wait(TRUE, FALSE);
514			if (pmsafe_debug && !kdp_snapshot)
515				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
516		} else
517#endif	/* MACH_KDP */
518		if (i_bit(MP_TLB_FLUSH, my_word)) {
519			DBGLOG(cpu_handle,my_cpu,MP_TLB_FLUSH);
520			i_bit_clear(MP_TLB_FLUSH, my_word);
521			pmap_update_interrupt();
522		} else if (i_bit(MP_AST, my_word)) {
523			DBGLOG(cpu_handle,my_cpu,MP_AST);
524			i_bit_clear(MP_AST, my_word);
525			ast_check(cpu_to_processor(my_cpu));
526		} else if (i_bit(MP_RENDEZVOUS, my_word)) {
527			DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS);
528			i_bit_clear(MP_RENDEZVOUS, my_word);
529			mp_rendezvous_action();
530		} else if (i_bit(MP_BROADCAST, my_word)) {
531			DBGLOG(cpu_handle,my_cpu,MP_BROADCAST);
532			i_bit_clear(MP_BROADCAST, my_word);
533			mp_broadcast_action();
534		} else if (i_bit(MP_CHUD, my_word)) {
535			DBGLOG(cpu_handle,my_cpu,MP_CHUD);
536			i_bit_clear(MP_CHUD, my_word);
537			chudxnu_cpu_signal_handler();
538		} else if (i_bit(MP_CALL, my_word)) {
539			DBGLOG(cpu_handle,my_cpu,MP_CALL);
540			i_bit_clear(MP_CALL, my_word);
541			mp_cpus_call_action();
542		} else if (i_bit(MP_CALL_PM, my_word)) {
543			DBGLOG(cpu_handle,my_cpu,MP_CALL_PM);
544			i_bit_clear(MP_CALL_PM, my_word);
545			mp_call_PM();
546		}
547	} while (*my_word);
548
549	return 0;
550}
551
552static int
553NMIInterruptHandler(x86_saved_state_t *regs)
554{
555	void 	*stackptr;
556
557	if (panic_active() && !panicDebugging) {
558		if (pmsafe_debug)
559			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
560		for(;;)
561			cpu_pause();
562	}
563
564	atomic_incl(&NMIPI_acks, 1);
565	sync_iss_to_iks_unconditionally(regs);
566#if defined (__i386__)
567	__asm__ volatile("movl %%ebp, %0" : "=m" (stackptr));
568#elif defined (__x86_64__)
569	__asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
570#endif
571
572	if (cpu_number() == debugger_cpu)
573			goto NMExit;
574
575	if (spinlock_timed_out) {
576		char pstr[192];
577		snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): NMIPI for spinlock acquisition timeout, spinlock: %p, spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n", cpu_number(), spinlock_timed_out, (void *) spinlock_timed_out->interlock.lock_data, current_thread(), spinlock_owner_cpu);
578		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
579	} else if (pmap_tlb_flush_timeout == TRUE) {
580		char pstr[128];
581		snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor (this CPU did not acknowledge interrupts) TLB state:0x%x\n", cpu_number(), current_cpu_datap()->cpu_tlb_invalid);
582		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
583	}
584
585#if MACH_KDP
586	if (pmsafe_debug && !kdp_snapshot)
587		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
588	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
589	mp_kdp_wait(FALSE, pmap_tlb_flush_timeout || spinlock_timed_out || panic_active());
590	if (pmsafe_debug && !kdp_snapshot)
591		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
592#endif
593NMExit:
594	return 1;
595}
596
597
598/*
599 * cpu_interrupt is really just to be used by the scheduler to
600 * get a CPU's attention; it may not always issue an IPI.  If an
601 * IPI is always needed, use i386_cpu_IPI() instead.
602 */
603void
604cpu_interrupt(int cpu)
605{
606	boolean_t did_IPI = FALSE;
607
608	if (smp_initialized
609	    && pmCPUExitIdle(cpu_datap(cpu))) {
610		i386_cpu_IPI(cpu);
611		did_IPI = TRUE;
612	}
613
614	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
615}
616
617/*
618 * Send a true NMI via the local APIC to the specified CPU.
619 */
620void
621cpu_NMI_interrupt(int cpu)
622{
623	if (smp_initialized) {
624		i386_send_NMI(cpu);
625	}
626}
627
628static void	(* volatile mp_PM_func)(void) = NULL;
629
630static void
631mp_call_PM(void)
632{
633	assert(!ml_get_interrupts_enabled());
634
635	if (mp_PM_func != NULL)
636		mp_PM_func();
637}
638
639void
640cpu_PM_interrupt(int cpu)
641{
642	assert(!ml_get_interrupts_enabled());
643
644	if (mp_PM_func != NULL) {
645		if (cpu == cpu_number())
646			mp_PM_func();
647		else
648			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
649	}
650}
651
652void
653PM_interrupt_register(void (*fn)(void))
654{
655	mp_PM_func = fn;
656}
657
658void
659i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
660{
661	volatile int	*signals = &cpu_datap(cpu)->cpu_signals;
662	uint64_t	tsc_timeout;
663
664
665	if (!cpu_datap(cpu)->cpu_running)
666		return;
667
668	if (event == MP_TLB_FLUSH)
669	        KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
670
671	DBGLOG(cpu_signal, cpu, event);
672
673	i_bit_set(event, signals);
674	i386_cpu_IPI(cpu);
675	if (mode == SYNC) {
676	   again:
677		tsc_timeout = rdtsc64() + (1000*1000*1000);
678		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
679			cpu_pause();
680		}
681		if (i_bit(event, signals)) {
682			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
683				cpu, event);
684			goto again;
685		}
686	}
687	if (event == MP_TLB_FLUSH)
688	        KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
689}
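
/*
 * Hedged usage sketch (illustrative only): ASYNC posts the event and the
 * IPI without waiting, while SYNC spins until the target clears the signal
 * bit, retrying after a TSC-based timeout of roughly one billion ticks.
 * For example, the scheduler nudges a remote cpu with
 *
 *	i386_signal_cpu(cpu, MP_AST, ASYNC);
 *
 * as cause_ast_check() does near the end of this file.
 */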
690
691/*
692 * Send event to all running cpus.
693 * Called with the topology locked.
694 */
695void
696i386_signal_cpus(mp_event_t event, mp_sync_t mode)
697{
698	unsigned int	cpu;
699	unsigned int	my_cpu = cpu_number();
700
701	assert(hw_lock_held((hw_lock_t)&x86_topo_lock));
702
703	for (cpu = 0; cpu < real_ncpus; cpu++) {
704		if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
705			continue;
706		i386_signal_cpu(cpu, event, mode);
707	}
708}
709
710/*
711 * Return the number of running cpus.
712 * Called with the topology locked.
713 */
714int
715i386_active_cpus(void)
716{
717	unsigned int	cpu;
718	unsigned int	ncpus = 0;
719
720	assert(hw_lock_held((hw_lock_t)&x86_topo_lock));
721
722	for (cpu = 0; cpu < real_ncpus; cpu++) {
723		if (cpu_datap(cpu)->cpu_running)
724			ncpus++;
725	}
726	return(ncpus);
727}
728
729/*
730 * All-CPU rendezvous:
731 * 	- CPUs are signalled,
732 *	- all execute the setup function (if specified),
733 *	- rendezvous (i.e. all cpus reach a barrier),
734 *	- all execute the action function (if specified),
735 *	- rendezvous again,
736 *	- execute the teardown function (if specified), and then
737 *	- resume.
738 *
739 * Note that the supplied external functions _must_ be reentrant and aware
740 * that they are running in parallel and in an unknown lock context.
741 */
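
/*
 * Hedged usage sketch (illustrative only): a typical caller supplies just
 * an action function and lets the entry/exit barriers described above
 * bracket it. The names example_action and example_arg are hypothetical.
 *
 *	static void example_action(void *arg)
 *	{
 *		// Runs on every running cpu, in parallel, in an unknown
 *		// lock context; it must therefore be reentrant.
 *		kprintf("cpu %d: rendezvous arg %p\n", cpu_number(), arg);
 *	}
 *
 *	mp_rendezvous(NULL, example_action, NULL, example_arg);
 */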
742
743static void
744mp_rendezvous_action(void)
745{
746	boolean_t intrs_enabled;
747
748	/* setup function */
749	if (mp_rv_setup_func != NULL)
750		mp_rv_setup_func(mp_rv_func_arg);
751
752	intrs_enabled = ml_get_interrupts_enabled();
753
754	/* spin on entry rendezvous */
755	atomic_incl(&mp_rv_entry, 1);
756	while (mp_rv_entry < mp_rv_ncpus) {
757		/* poll for pesky tlb flushes if interrupts disabled */
758		if (!intrs_enabled)
759			handle_pending_TLB_flushes();
760		cpu_pause();
761	}
762
763	/* action function */
764	if (mp_rv_action_func != NULL)
765		mp_rv_action_func(mp_rv_func_arg);
766
767	/* spin on exit rendezvous */
768	atomic_incl(&mp_rv_exit, 1);
769	while (mp_rv_exit < mp_rv_ncpus) {
770		if (!intrs_enabled)
771			handle_pending_TLB_flushes();
772		cpu_pause();
773	}
774
775	/* teardown function */
776	if (mp_rv_teardown_func != NULL)
777		mp_rv_teardown_func(mp_rv_func_arg);
778
779	/* Bump completion count */
780	atomic_incl(&mp_rv_complete, 1);
781}
782
783void
784mp_rendezvous(void (*setup_func)(void *),
785	      void (*action_func)(void *),
786	      void (*teardown_func)(void *),
787	      void *arg)
788{
789
790	if (!smp_initialized) {
791		if (setup_func != NULL)
792			setup_func(arg);
793		if (action_func != NULL)
794			action_func(arg);
795		if (teardown_func != NULL)
796			teardown_func(arg);
797		return;
798	}
799
800	/* obtain rendezvous lock */
801	simple_lock(&mp_rv_lock);
802
803	/* set static function pointers */
804	mp_rv_setup_func = setup_func;
805	mp_rv_action_func = action_func;
806	mp_rv_teardown_func = teardown_func;
807	mp_rv_func_arg = arg;
808
809	mp_rv_entry    = 0;
810	mp_rv_exit     = 0;
811	mp_rv_complete = 0;
812
813	/*
814	 * signal other processors, which will call mp_rendezvous_action()
815	 * with interrupts disabled
816	 */
817	simple_lock(&x86_topo_lock);
818	mp_rv_ncpus = i386_active_cpus();
819	i386_signal_cpus(MP_RENDEZVOUS, ASYNC);
820	simple_unlock(&x86_topo_lock);
821
822	/* call executor function on this cpu */
823	mp_rendezvous_action();
824
825	/*
826	 * Spin for everyone to complete.
827	 * This is necessary to ensure that all processors have proceeded
828	 * from the exit barrier before we release the rendezvous structure.
829	 */
830	while (mp_rv_complete < mp_rv_ncpus) {
831		cpu_pause();
832	}
833
834	/* Tidy up */
835	mp_rv_setup_func = NULL;
836	mp_rv_action_func = NULL;
837	mp_rv_teardown_func = NULL;
838	mp_rv_func_arg = NULL;
839
840	/* release lock */
841	simple_unlock(&mp_rv_lock);
842}
843
844void
845mp_rendezvous_break_lock(void)
846{
847	simple_lock_init(&mp_rv_lock, 0);
848}
849
850static void
851setup_disable_intrs(__unused void * param_not_used)
852{
853	/* disable interrupts before the first barrier */
854	boolean_t intr = ml_set_interrupts_enabled(FALSE);
855
856	current_cpu_datap()->cpu_iflag = intr;
857	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
858}
859
860static void
861teardown_restore_intrs(__unused void * param_not_used)
862{
863	/* restore interrupt flag following MTRR changes */
864	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
865	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
866}
867
868/*
869 * A wrapper around mp_rendezvous() that calls action_func() with interrupts disabled.
870 * This is exported for use by kexts.
871 */
872void
873mp_rendezvous_no_intrs(
874	      void (*action_func)(void *),
875	      void *arg)
876{
877	mp_rendezvous(setup_disable_intrs,
878		      action_func,
879		      teardown_restore_intrs,
880		      arg);
881}
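
/*
 * Hedged usage sketch (illustrative only): a kext that must update per-cpu
 * hardware state consistently across the machine might use the wrapper
 * above so the action runs everywhere with interrupts masked. The
 * example_* names are hypothetical.
 *
 *	static void example_update(void *arg)
 *	{
 *		// Interrupts were disabled by setup_disable_intrs() and are
 *		// restored by teardown_restore_intrs() afterwards.
 *		example_apply(arg);
 *	}
 *
 *	mp_rendezvous_no_intrs(example_update, &example_state);
 */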
882
883
884typedef struct {
885	queue_chain_t	link;			/* queue linkage */
886	void		(*func)(void *,void *);	/* routine to call */
887	void		*arg0;			/* routine's 1st arg */
888	void		*arg1;			/* routine's 2nd arg */
889	volatile long	*countp;		/* completion counter */
890} mp_call_t;
891
892
893typedef struct {
894	queue_head_t		queue;
895	decl_simple_lock_data(,	lock);
896} mp_call_queue_t;
897#define MP_CPUS_CALL_BUFS_PER_CPU	MAX_CPUS
898static mp_call_queue_t	mp_cpus_call_freelist;
899static mp_call_queue_t	mp_cpus_call_head[MAX_CPUS];
900
901static inline boolean_t
902mp_call_head_lock(mp_call_queue_t *cqp)
903{
904	boolean_t	intrs_enabled;
905
906	intrs_enabled = ml_set_interrupts_enabled(FALSE);
907	simple_lock(&cqp->lock);
908
909	return intrs_enabled;
910}
911
912static inline boolean_t
913mp_call_head_is_locked(mp_call_queue_t *cqp)
914{
915	return !ml_get_interrupts_enabled() &&
916		hw_lock_held((hw_lock_t)&cqp->lock);
917}
918
919static inline void
920mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
921{
922	simple_unlock(&cqp->lock);
923	ml_set_interrupts_enabled(intrs_enabled);
924}
925
926static inline mp_call_t *
927mp_call_alloc(void)
928{
929	mp_call_t	*callp = NULL;
930	boolean_t	intrs_enabled;
931	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
932
933	intrs_enabled = mp_call_head_lock(cqp);
934	if (!queue_empty(&cqp->queue))
935		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
936	mp_call_head_unlock(cqp, intrs_enabled);
937
938	return callp;
939}
940
941static inline void
942mp_call_free(mp_call_t *callp)
943{
944	boolean_t	intrs_enabled;
945	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
946
947	intrs_enabled = mp_call_head_lock(cqp);
948	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
949	mp_call_head_unlock(cqp, intrs_enabled);
950}
951
952static inline mp_call_t *
953mp_call_dequeue_locked(mp_call_queue_t *cqp)
954{
955	mp_call_t	*callp = NULL;
956
957	assert(mp_call_head_is_locked(cqp));
958	if (!queue_empty(&cqp->queue))
959		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
960	return callp;
961}
962
963static inline void
964mp_call_enqueue_locked(
965	mp_call_queue_t	*cqp,
966	mp_call_t	*callp)
967{
968	queue_enter(&cqp->queue, callp, typeof(callp), link);
969}
970
971/* Called on the boot processor to initialize global structures */
972static void
973mp_cpus_call_init(void)
974{
975	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
976
977	DBG("mp_cpus_call_init()\n");
978	simple_lock_init(&cqp->lock, 0);
979	queue_init(&cqp->queue);
980}
981
982/*
983 * Called by each processor to add call buffers to the free list
984 * and to initialize the per-cpu call queue.
985 * It is also called, but is a no-op, on slave processors at re-start/wake.
986 */
987static void
988mp_cpus_call_cpu_init(void)
989{
990	int		i;
991	mp_call_queue_t	*cqp = &mp_cpus_call_head[cpu_number()];
992	mp_call_t	*callp;
993
994	if (cqp->queue.next != NULL)
995		return; /* restart/wake case: called already */
996
997	simple_lock_init(&cqp->lock, 0);
998	queue_init(&cqp->queue);
999	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1000		callp = (mp_call_t *) kalloc(sizeof(mp_call_t));
1001		mp_call_free(callp);
1002	}
1003
1004	DBG("mp_cpus_call_init() done on cpu %d\n", cpu_number());
1005}
1006
1007/*
1008 * This is called from cpu_signal_handler() to process an MP_CALL signal,
1009 * and from i386_deactivate_cpu() when a cpu is being taken offline.
1010 */
1011static void
1012mp_cpus_call_action(void)
1013{
1014	mp_call_queue_t	*cqp;
1015	boolean_t	intrs_enabled;
1016	mp_call_t	*callp;
1017	mp_call_t	call;
1018
1019	assert(!ml_get_interrupts_enabled());
1020	cqp = &mp_cpus_call_head[cpu_number()];
1021	intrs_enabled = mp_call_head_lock(cqp);
1022	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
1023		/* Copy call request to the stack to free buffer */
1024		call = *callp;
1025		mp_call_free(callp);
1026		if (call.func != NULL) {
1027			mp_call_head_unlock(cqp, intrs_enabled);
1028			KERNEL_DEBUG_CONSTANT(
1029				TRACE_MP_CPUS_CALL_ACTION,
1030				call.func, call.arg0, call.arg1, call.countp, 0);
1031			call.func(call.arg0, call.arg1);
1032			(void) mp_call_head_lock(cqp);
1033		}
1034		if (call.countp != NULL)
1035			atomic_incl(call.countp, 1);
1036	}
1037	mp_call_head_unlock(cqp, intrs_enabled);
1038}
1039
1040/*
1041 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1042 * Possible modes are:
1043 *  SYNC:   the function is called serially on the target cpus, in logical
1044 *	    cpu order, waiting for each call to be acknowledged before proceeding.
1045 *  ASYNC:  the function call is queued to the specified cpus and we wait
1046 *	    for all of the calls to complete (in parallel) before returning.
1047 *  NOSYNC: the function calls are queued,
1048 *	    but we return before confirmation of the calls completing.
1049 * The action function may be NULL.
1050 * The cpu mask may include the local cpu. Offline cpus are ignored.
1051 * The return value is the number of cpus on which the call was made or queued.
1052 */
1053cpu_t
1054mp_cpus_call(
1055	cpumask_t	cpus,
1056	mp_sync_t	mode,
1057        void		(*action_func)(void *),
1058        void		*arg)
1059{
1060	return mp_cpus_call1(
1061			cpus,
1062			mode,
1063			(void (*)(void *,void *))action_func,
1064			arg,
1065			NULL,
1066			NULL,
1067			NULL);
1068}
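
/*
 * Hedged usage sketches (illustrative only; the example_* names are
 * hypothetical). Following the mode descriptions above:
 *
 *	// SYNC: run example_action on the master cpu and wait for it.
 *	mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, example_action, arg);
 *
 *	// ASYNC: queue the call to cpus 0 and 1 (the caller, if included,
 *	// runs it locally after signalling the others) and wait for all
 *	// of the calls to complete.
 *	mp_cpus_call(cpu_to_cpumask(0) | cpu_to_cpumask(1), ASYNC,
 *		     example_action, arg);
 */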
1069
1070static void
1071mp_cpus_call_wait(boolean_t	intrs_enabled,
1072		  long		mp_cpus_signals,
1073		  volatile long	*mp_cpus_calls)
1074{
1075	mp_call_queue_t		*cqp;
1076
1077	cqp = &mp_cpus_call_head[cpu_number()];
1078
1079	while (*mp_cpus_calls < mp_cpus_signals) {
1080		if (!intrs_enabled) {
1081			/* Sniffing w/o locking */
1082			if (!queue_empty(&cqp->queue))
1083				mp_cpus_call_action();
1084			handle_pending_TLB_flushes();
1085		}
1086		cpu_pause();
1087	}
1088}
1089
1090cpu_t
1091mp_cpus_call1(
1092	cpumask_t	cpus,
1093	mp_sync_t	mode,
1094        void		(*action_func)(void *, void *),
1095        void		*arg0,
1096        void		*arg1,
1097	cpumask_t	*cpus_calledp,
1098	cpumask_t	*cpus_notcalledp)
1099{
1100	cpu_t		cpu;
1101	boolean_t	intrs_enabled = FALSE;
1102	boolean_t	call_self = FALSE;
1103	cpumask_t	cpus_called = 0;
1104	cpumask_t	cpus_notcalled = 0;
1105	long 		mp_cpus_signals = 0;
1106	volatile long	mp_cpus_calls = 0;
1107
1108	KERNEL_DEBUG_CONSTANT(
1109		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
1110		cpus, mode, VM_KERNEL_UNSLIDE(action_func), arg0, arg1);
1111
1112	if (!smp_initialized) {
1113		if ((cpus & CPUMASK_SELF) == 0)
1114			goto out;
1115		if (action_func != NULL) {
1116			intrs_enabled = ml_set_interrupts_enabled(FALSE);
1117			action_func(arg0, arg1);
1118			ml_set_interrupts_enabled(intrs_enabled);
1119		}
1120		call_self = TRUE;
1121		goto out;
1122	}
1123
1124	/*
1125	 * Queue the call for each non-local requested cpu.
1126	 * The topo lock is not taken. Instead we sniff the cpu_running state
1127	 * and then re-check it after taking the call lock. A cpu being taken
1128	 * offline runs the action function after clearing cpu_running.
1129	 */
1130	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1131		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
1132		    !cpu_datap(cpu)->cpu_running)
1133			continue;
1134		if (cpu == (cpu_t) cpu_number()) {
1135			/*
1136			 * We don't IPI ourselves and, if calling asynchronously,
1137			 * we defer our call until we have signalled all others.
1138			 */
1139			call_self = TRUE;
1140			cpus_called |= cpu_to_cpumask(cpu);
1141			if (mode == SYNC && action_func != NULL) {
1142				KERNEL_DEBUG_CONSTANT(
1143					TRACE_MP_CPUS_CALL_LOCAL,
1144					VM_KERNEL_UNSLIDE(action_func),
1145					arg0, arg1, 0, 0);
1146				action_func(arg0, arg1);
1147			}
1148		} else {
1149			/*
1150			 * Here to queue a call to cpu and IPI.
1151			 * Spinning for request buffer unless NOSYNC.
1152			 */
1153			mp_call_t	*callp = NULL;
1154			mp_call_queue_t	*cqp = &mp_cpus_call_head[cpu];
1155
1156		queue_call:
1157			if (callp == NULL)
1158				callp = mp_call_alloc();
1159			intrs_enabled = mp_call_head_lock(cqp);
1160			if (!cpu_datap(cpu)->cpu_running) {
1161				mp_call_head_unlock(cqp, intrs_enabled);
1162				continue;
1163			}
1164			if (mode == NOSYNC) {
1165				if (callp == NULL) {
1166					cpus_notcalled |= cpu_to_cpumask(cpu);
1167					mp_call_head_unlock(cqp, intrs_enabled);
1168					KERNEL_DEBUG_CONSTANT(
1169						TRACE_MP_CPUS_CALL_NOBUF,
1170						cpu, 0, 0, 0, 0);
1171					continue;
1172				}
1173				callp->countp = NULL;
1174			} else {
1175				if (callp == NULL) {
1176					mp_call_head_unlock(cqp, intrs_enabled);
1177					KERNEL_DEBUG_CONSTANT(
1178						TRACE_MP_CPUS_CALL_NOBUF,
1179						cpu, 0, 0, 0, 0);
1180					if (!intrs_enabled) {
1181						/* Sniffing w/o locking */
1182						if (!queue_empty(&cqp->queue))
1183							mp_cpus_call_action();
1184						handle_pending_TLB_flushes();
1185					}
1186					cpu_pause();
1187					goto queue_call;
1188				}
1189				callp->countp = &mp_cpus_calls;
1190			}
1191			callp->func = action_func;
1192			callp->arg0 = arg0;
1193			callp->arg1 = arg1;
1194			mp_call_enqueue_locked(cqp, callp);
1195			mp_cpus_signals++;
1196			cpus_called |= cpu_to_cpumask(cpu);
1197			i386_signal_cpu(cpu, MP_CALL, ASYNC);
1198			mp_call_head_unlock(cqp, intrs_enabled);
1199			if (mode == SYNC) {
1200				mp_cpus_call_wait(intrs_enabled, mp_cpus_signals, &mp_cpus_calls);
1201			}
1202		}
1203	}
1204
1205	/* Call locally if mode is not SYNC */
1206	if (mode != SYNC && call_self ) {
1207		KERNEL_DEBUG_CONSTANT(
1208			TRACE_MP_CPUS_CALL_LOCAL,
1209			VM_KERNEL_UNSLIDE(action_func), arg0, arg1, 0, 0);
1210		if (action_func != NULL) {
1211			ml_set_interrupts_enabled(FALSE);
1212			action_func(arg0, arg1);
1213			ml_set_interrupts_enabled(intrs_enabled);
1214		}
1215	}
1216
1217	/* For ASYNC, now wait for all signaled cpus to complete their calls */
1218	if (mode == ASYNC) {
1219		mp_cpus_call_wait(intrs_enabled, mp_cpus_signals, &mp_cpus_calls);
1220	}
1221
1222out:
1223	cpu = (cpu_t) mp_cpus_signals + (call_self ? 1 : 0);
1224
1225	if (cpus_calledp)
1226		*cpus_calledp = cpus_called;
1227	if (cpus_notcalledp)
1228		*cpus_notcalledp = cpus_notcalled;
1229
1230	KERNEL_DEBUG_CONSTANT(
1231		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
1232		cpu, cpus_called, cpus_notcalled, 0, 0);
1233
1234	return cpu;
1235}
1236
1237
1238static void
1239mp_broadcast_action(void)
1240{
1241   /* call action function */
1242   if (mp_bc_action_func != NULL)
1243       mp_bc_action_func(mp_bc_func_arg);
1244
1245   /* if we're the last one through, wake up the instigator */
1246   if (atomic_decl_and_test(&mp_bc_count, 1))
1247       thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
1248}
1249
1250/*
1251 * mp_broadcast() runs a given function on all active cpus.
1252 * The caller blocks until the function has run on all cpus.
1253 * The caller will also block if there is another pending broadcast.
1254 */
1255void
1256mp_broadcast(
1257         void (*action_func)(void *),
1258         void *arg)
1259{
1260   if (!smp_initialized) {
1261       if (action_func != NULL)
1262	           action_func(arg);
1263       return;
1264   }
1265
1266   /* obtain broadcast lock */
1267   lck_mtx_lock(&mp_bc_lock);
1268
1269   /* set static function pointers */
1270   mp_bc_action_func = action_func;
1271   mp_bc_func_arg = arg;
1272
1273   assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);
1274
1275   /*
1276    * signal other processors, which will call mp_broadcast_action()
1277    */
1278   simple_lock(&x86_topo_lock);
1279   mp_bc_ncpus = i386_active_cpus();   /* total including this cpu */
1280   mp_bc_count = mp_bc_ncpus;
1281   i386_signal_cpus(MP_BROADCAST, ASYNC);
1282
1283   /* call executor function on this cpu */
1284   mp_broadcast_action();
1285   simple_unlock(&x86_topo_lock);
1286
1287   /* block for all cpus to have run action_func */
1288   if (mp_bc_ncpus > 1)
1289       thread_block(THREAD_CONTINUE_NULL);
1290   else
1291       clear_wait(current_thread(), THREAD_AWAKENED);
1292
1293   /* release lock */
1294   lck_mtx_unlock(&mp_bc_lock);
1295}
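
/*
 * Hedged usage sketch (illustrative only): a single blocking broadcast,
 * e.g.
 *
 *	mp_broadcast(example_action, example_arg);	// hypothetical names
 *
 * Unlike mp_rendezvous(), the other cpus are not held at barriers; each
 * simply runs the action, decrements mp_bc_count, and the last one wakes
 * the caller.
 */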
1296
1297void
1298i386_activate_cpu(void)
1299{
1300	cpu_data_t	*cdp = current_cpu_datap();
1301
1302	assert(!ml_get_interrupts_enabled());
1303
1304	if (!smp_initialized) {
1305		cdp->cpu_running = TRUE;
1306		return;
1307	}
1308
1309	simple_lock(&x86_topo_lock);
1310	cdp->cpu_running = TRUE;
1311	started_cpu();
1312	simple_unlock(&x86_topo_lock);
1313	flush_tlb_raw();
1314}
1315
1316extern void etimer_timer_expire(void	*arg);
1317
1318void
1319i386_deactivate_cpu(void)
1320{
1321	cpu_data_t	*cdp = current_cpu_datap();
1322
1323	assert(!ml_get_interrupts_enabled());
1324
1325	KERNEL_DEBUG_CONSTANT(
1326		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
1327		0, 0, 0, 0, 0);
1328
1329	simple_lock(&x86_topo_lock);
1330	cdp->cpu_running = FALSE;
1331	simple_unlock(&x86_topo_lock);
1332
1333	/*
1334	 * Move all of this cpu's timers to the master/boot cpu,
1335	 * and poke it in case there's a sooner deadline for it to schedule.
1336	 */
1337	timer_queue_shutdown(&cdp->rtclock_timer.queue);
1338	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL);
1339
1340	/*
1341	 * Open an interrupt window
1342	 * and ensure any pending IPI or timer is serviced
1343	 */
1344	mp_disable_preemption();
1345	ml_set_interrupts_enabled(TRUE);
1346
1347	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime)
1348		cpu_pause();
1349	/*
1350	 * Ensure there's no remaining timer deadline set
1351	 * - AICPM may have left one active.
1352	 */
1353	setPop(0);
1354
1355	ml_set_interrupts_enabled(FALSE);
1356	mp_enable_preemption();
1357
1358	KERNEL_DEBUG_CONSTANT(
1359		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
1360		0, 0, 0, 0, 0);
1361}
1362
1363int	pmsafe_debug	= 1;
1364
1365#if	MACH_KDP
1366volatile boolean_t	mp_kdp_trap = FALSE;
1367volatile unsigned long	mp_kdp_ncpus;
1368boolean_t		mp_kdp_state;
1369
1370
1371void
1372mp_kdp_enter(void)
1373{
1374	unsigned int	cpu;
1375	unsigned int	ncpus = 0;
1376	unsigned int	my_cpu;
1377	uint64_t	tsc_timeout;
1378
1379	DBG("mp_kdp_enter()\n");
1380
1381	/*
1382	 * Here to enter the debugger.
1383	 * In case of races, only one cpu is allowed to enter kdp after
1384	 * stopping others.
1385	 */
1386	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1387	my_cpu = cpu_number();
1388
1389	if (my_cpu == (unsigned) debugger_cpu) {
1390		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1391		kdp_reset();
1392		return;
1393	}
1394
1395	cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1396	simple_lock(&mp_kdp_lock);
1397
1398	if (pmsafe_debug && !kdp_snapshot)
1399	    pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1400
1401	while (mp_kdp_trap) {
1402		simple_unlock(&mp_kdp_lock);
1403		DBG("mp_kdp_enter() race lost\n");
1404#if MACH_KDP
1405		mp_kdp_wait(TRUE, FALSE);
1406#endif
1407		simple_lock(&mp_kdp_lock);
1408	}
1409	debugger_cpu = my_cpu;
1410	ncpus = 1;
1411	mp_kdp_ncpus = 1;	/* self */
1412	mp_kdp_trap = TRUE;
1413	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1414	simple_unlock(&mp_kdp_lock);
1415
1416	/*
1417	 * Deliver a nudge to other cpus, counting how many
1418	 */
1419	DBG("mp_kdp_enter() signaling other processors\n");
1420	if (force_immediate_debugger_NMI == FALSE) {
1421		for (cpu = 0; cpu < real_ncpus; cpu++) {
1422			if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1423				continue;
1424			ncpus++;
1425			i386_signal_cpu(cpu, MP_KDP, ASYNC);
1426		}
1427		/*
1428		 * Wait for the other processors to synchronize.
1429		 */
1430		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1431
1432		/*
1433		 * This timeout is rather arbitrary; we don't want to NMI
1434		 * processors that are executing at potentially
1435		 * "unsafe-to-interrupt" points such as the trampolines,
1436		 * but neither do we want to lose state by waiting too long.
1437		 */
1438		tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000 * 10ULL);
1439
1440		if (virtualized)
1441			tsc_timeout = ~0ULL;
1442
1443		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1444			/*
1445			 * A TLB shootdown request may be pending--this would
1446			 * result in the requesting processor waiting in
1447			 * PMAP_UPDATE_TLBS() until this processor deals with it.
1448			 * Process it, so that the requester can then enter mp_kdp_wait().
1449			 */
1450			handle_pending_TLB_flushes();
1451			cpu_pause();
1452		}
1453		/* If we've timed out, and some processor(s) are still unresponsive,
1454		 * interrupt them with an NMI via the local APIC.
1455		 */
1456		if (mp_kdp_ncpus != ncpus) {
1457			for (cpu = 0; cpu < real_ncpus; cpu++) {
1458				if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1459					continue;
1460				if (cpu_signal_pending(cpu, MP_KDP))
1461					cpu_NMI_interrupt(cpu);
1462			}
1463		}
1464	}
1465	else
1466		for (cpu = 0; cpu < real_ncpus; cpu++) {
1467			if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1468				continue;
1469			cpu_NMI_interrupt(cpu);
1470		}
1471
1472	DBG("mp_kdp_enter() %d processors done %s\n",
1473	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1474
1475	postcode(MP_KDP_ENTER);
1476}
1477
1478static boolean_t
1479cpu_signal_pending(int cpu, mp_event_t event)
1480{
1481	volatile int	*signals = &cpu_datap(cpu)->cpu_signals;
1482	boolean_t retval = FALSE;
1483
1484	if (i_bit(event, signals))
1485		retval = TRUE;
1486	return retval;
1487}
1488
1489long kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
1490			 void *arg0, void *arg1)
1491{
1492	if (lcpu > (real_ncpus - 1))
1493		return -1;
1494
1495        if (func == NULL)
1496		return -1;
1497
1498	kdp_xcpu_call_func.func = func;
1499        kdp_xcpu_call_func.ret  = -1;
1500	kdp_xcpu_call_func.arg0 = arg0;
1501	kdp_xcpu_call_func.arg1 = arg1;
1502	kdp_xcpu_call_func.cpu  = lcpu;
1503	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
1504	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE)
1505		cpu_pause();
1506        return kdp_xcpu_call_func.ret;
1507}
1508
1509static void
1510kdp_x86_xcpu_poll(void)
1511{
1512	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
1513            kdp_xcpu_call_func.ret =
1514		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
1515					    kdp_xcpu_call_func.arg1,
1516					    cpu_number());
1517		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
1518	}
1519}
1520
1521static void
1522mp_kdp_wait(boolean_t flush, boolean_t isNMI)
1523{
1524	DBG("mp_kdp_wait()\n");
1525	/* If an I/O port has been specified as a debugging aid, issue a read */
1526	panic_io_port_read();
1527	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
1528#if CONFIG_MCA
1529	/* If we've trapped due to a machine-check, save MCA registers */
1530	mca_check_save();
1531#endif
1532
1533	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1534	while (mp_kdp_trap || (isNMI == TRUE)) {
1535	        /*
1536		 * A TLB shootdown request may be pending--this would result
1537		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
1538		 * until this processor handles it.
1539		 * Process it, so that the requester can then enter mp_kdp_wait().
1540		 */
1541		if (flush)
1542			handle_pending_TLB_flushes();
1543
1544		kdp_x86_xcpu_poll();
1545		cpu_pause();
1546	}
1547
1548	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1549	DBG("mp_kdp_wait() done\n");
1550}
1551
1552void
1553mp_kdp_exit(void)
1554{
1555	DBG("mp_kdp_exit()\n");
1556	debugger_cpu = -1;
1557	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1558
1559	debugger_exit_time = mach_absolute_time();
1560
1561	mp_kdp_trap = FALSE;
1562	__asm__ volatile("mfence");
1563
1564	/* Wait for the other processors to stop spinning. XXX needs timeout */
1565	DBG("mp_kdp_exit() waiting for processors to resume\n");
1566	while (mp_kdp_ncpus > 0) {
1567	        /*
1568		 * A TLB shootdown request may be pending... this would result in the requesting
1569		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
1570		 * Process it, so that the requesting processor is not stalled on us.
1571		 */
1572	        handle_pending_TLB_flushes();
1573
1574		cpu_pause();
1575	}
1576
1577	if (pmsafe_debug && !kdp_snapshot)
1578	    pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
1579
1580	debugger_exit_time = mach_absolute_time();
1581
1582	DBG("mp_kdp_exit() done\n");
1583	(void) ml_set_interrupts_enabled(mp_kdp_state);
1584	postcode(0);
1585}
1586#endif	/* MACH_KDP */
1587
1588boolean_t
1589mp_recent_debugger_activity(void) {
1590	uint64_t abstime = mach_absolute_time();
1591	return (((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1592	    ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance));
1593}
1594
1595/*ARGSUSED*/
1596void
1597init_ast_check(
1598	__unused processor_t	processor)
1599{
1600}
1601
1602void
1603cause_ast_check(
1604	processor_t	processor)
1605{
1606	int	cpu = processor->cpu_id;
1607
1608	if (cpu != cpu_number()) {
1609		i386_signal_cpu(cpu, MP_AST, ASYNC);
1610		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1611	}
1612}
1613
1614void
1615slave_machine_init(void *param)
1616{
1617	/*
1618 	 * Here in process context, but with interrupts disabled.
1619	 */
1620	DBG("slave_machine_init() CPU%d\n", get_cpu_number());
1621
1622	if (param == FULL_SLAVE_INIT) {
1623		/*
1624		 * Cold start
1625		 */
1626		clock_init();
1627		cpu_machine_init();	/* Interrupts enabled hereafter */
1628		mp_cpus_call_cpu_init();
1629	} else {
1630		cpu_machine_init();	/* Interrupts enabled hereafter */
1631	}
1632}
1633
1634#undef cpu_number
1635int cpu_number(void)
1636{
1637	return get_cpu_number();
1638}
1639
1640static void
1641cpu_prewarm_init()
1642{
1643	int i;
1644
1645	simple_lock_init(&cpu_warm_lock, 0);
1646	queue_init(&cpu_warm_call_list);
1647	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
1648		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
1649	}
1650}
1651
1652static timer_call_t
1653grab_warm_timer_call()
1654{
1655	spl_t x;
1656	timer_call_t call = NULL;
1657
1658	x = splsched();
1659	simple_lock(&cpu_warm_lock);
1660	if (!queue_empty(&cpu_warm_call_list)) {
1661		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
1662	}
1663	simple_unlock(&cpu_warm_lock);
1664	splx(x);
1665
1666	return call;
1667}
1668
1669static void
1670free_warm_timer_call(timer_call_t call)
1671{
1672	spl_t x;
1673
1674	x = splsched();
1675	simple_lock(&cpu_warm_lock);
1676	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
1677	simple_unlock(&cpu_warm_lock);
1678	splx(x);
1679}
1680
1681/*
1682 * Runs in timer call context (interrupts disabled).
1683 */
1684static void
1685cpu_warm_timer_call_func(
1686		call_entry_param_t p0,
1687		__unused call_entry_param_t p1)
1688{
1689	free_warm_timer_call((timer_call_t)p0);
1690	return;
1691}
1692
1693/*
1694 * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
1695 */
1696static void
1697_cpu_warm_setup(
1698		void *arg)
1699{
1700	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
1701
1702	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL);
1703	cwdp->cwd_result = 0;
1704
1705	return;
1706}
1707
1708/*
1709 * Not safe to call with interrupts disabled.
1710 */
1711kern_return_t
1712ml_interrupt_prewarm(
1713	uint64_t 	deadline)
1714{
1715	struct cpu_warm_data cwd;
1716	timer_call_t call;
1717	cpu_t ct;
1718
1719	if (ml_get_interrupts_enabled() == FALSE) {
1720		panic("%s: Interrupts disabled?\n", __FUNCTION__);
1721	}
1722
1723	/*
1724	 * If the platform doesn't need our help, say that we succeeded.
1725	 */
1726	if (!ml_get_interrupt_prewake_applicable()) {
1727		return KERN_SUCCESS;
1728	}
1729
1730	/*
1731	 * Grab a timer call to use.
1732	 */
1733	call = grab_warm_timer_call();
1734	if (call == NULL) {
1735		return KERN_RESOURCE_SHORTAGE;
1736	}
1737
1738	timer_call_setup(call, cpu_warm_timer_call_func, call);
1739	cwd.cwd_call = call;
1740	cwd.cwd_deadline = deadline;
1741	cwd.cwd_result = 0;
1742
1743	/*
1744	 * For now, non-local interrupts happen on the master processor.
1745	 */
1746	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
1747	if (ct == 0) {
1748		free_warm_timer_call(call);
1749		return KERN_FAILURE;
1750	} else {
1751		return cwd.cwd_result;
1752	}
1753}
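
/*
 * Hedged usage sketch (illustrative only): a caller that knows the next
 * timer deadline (as an absolute-time value) can ask for the master cpu
 * to be pre-warmed, treating a buffer shortage as non-fatal, e.g.
 *
 *	kern_return_t kr = ml_interrupt_prewarm(deadline);
 *	if (kr != KERN_SUCCESS && kr != KERN_RESOURCE_SHORTAGE)
 *		kprintf("interrupt prewarm failed: %d\n", kr);
 *
 * 'deadline' here is a hypothetical caller-supplied value.
 */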
1754