1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31
32#include <mach_rt.h>
33#include <mach_kdp.h>
34#include <mach_ldebug.h>
35#include <gprof.h>
36
37#include <mach/mach_types.h>
38#include <mach/kern_return.h>
39
40#include <kern/kern_types.h>
41#include <kern/startup.h>
42#include <kern/timer_queue.h>
43#include <kern/processor.h>
44#include <kern/cpu_number.h>
45#include <kern/cpu_data.h>
46#include <kern/assert.h>
47#include <kern/machine.h>
48#include <kern/pms.h>
49#include <kern/misc_protos.h>
50#include <kern/timer_call.h>
51#include <kern/kalloc.h>
52#include <kern/queue.h>
53#include <prng/random.h>
54
55#include <vm/vm_map.h>
56#include <vm/vm_kern.h>
57
58#include <profiling/profile-mk.h>
59
60#include <i386/bit_routines.h>
61#include <i386/proc_reg.h>
62#include <i386/cpu_threads.h>
63#include <i386/mp_desc.h>
64#include <i386/misc_protos.h>
65#include <i386/trap.h>
66#include <i386/postcode.h>
67#include <i386/machine_routines.h>
68#include <i386/mp.h>
69#include <i386/mp_events.h>
70#include <i386/lapic.h>
71#include <i386/cpuid.h>
72#include <i386/fpu.h>
73#include <i386/machine_cpu.h>
74#include <i386/pmCPU.h>
75#if CONFIG_MCA
76#include <i386/machine_check.h>
77#endif
78#include <i386/acpi.h>
79
80#include <chud/chud_xnu.h>
81#include <chud/chud_xnu_private.h>
82
83#include <sys/kdebug.h>
84
85#include <console/serial_protos.h>
86
87#if	MP_DEBUG
88#define PAUSE		delay(1000000)
89#define DBG(x...)	kprintf(x)
90#else
91#define DBG(x...)
92#define PAUSE
93#endif	/* MP_DEBUG */
94
95/* Debugging/test trace events: */
96#define	TRACE_MP_TLB_FLUSH		MACHDBG_CODE(DBG_MACH_MP, 0)
97#define	TRACE_MP_CPUS_CALL		MACHDBG_CODE(DBG_MACH_MP, 1)
98#define	TRACE_MP_CPUS_CALL_LOCAL	MACHDBG_CODE(DBG_MACH_MP, 2)
99#define	TRACE_MP_CPUS_CALL_ACTION	MACHDBG_CODE(DBG_MACH_MP, 3)
100#define	TRACE_MP_CPUS_CALL_NOBUF	MACHDBG_CODE(DBG_MACH_MP, 4)
101#define	TRACE_MP_CPU_FAST_START		MACHDBG_CODE(DBG_MACH_MP, 5)
102#define	TRACE_MP_CPU_START		MACHDBG_CODE(DBG_MACH_MP, 6)
103#define	TRACE_MP_CPU_DEACTIVATE		MACHDBG_CODE(DBG_MACH_MP, 7)
104
105#define ABS(v)		(((v) > 0)?(v):-(v))
106
107void 		slave_boot_init(void);
108void		i386_cpu_IPI(int cpu);
109
110#if MACH_KDP
111static void	mp_kdp_wait(boolean_t flush, boolean_t isNMI);
112#endif /* MACH_KDP */
113static void	mp_rendezvous_action(void);
114static void 	mp_broadcast_action(void);
115
116#if MACH_KDP
117static boolean_t	cpu_signal_pending(int cpu, mp_event_t event);
118#endif /* MACH_KDP */
119static int		NMIInterruptHandler(x86_saved_state_t *regs);
120
121boolean_t 		smp_initialized = FALSE;
122uint32_t 		TSC_sync_margin = 0xFFF;
123volatile boolean_t	force_immediate_debugger_NMI = FALSE;
124volatile boolean_t	pmap_tlb_flush_timeout = FALSE;
125decl_simple_lock_data(,mp_kdp_lock);
126
127decl_lck_mtx_data(static, mp_cpu_boot_lock);
128lck_mtx_ext_t	mp_cpu_boot_lock_ext;
129
130/* Variables needed for MP rendezvous. */
131decl_simple_lock_data(,mp_rv_lock);
132static void	(*mp_rv_setup_func)(void *arg);
133static void	(*mp_rv_action_func)(void *arg);
134static void	(*mp_rv_teardown_func)(void *arg);
135static void	*mp_rv_func_arg;
136static volatile int	mp_rv_ncpus;
137			/* Cache-aligned barriers: */
138static volatile long	mp_rv_entry    __attribute__((aligned(64)));
139static volatile long	mp_rv_exit     __attribute__((aligned(64)));
140static volatile long	mp_rv_complete __attribute__((aligned(64)));
141
142volatile	uint64_t	debugger_entry_time;
143volatile	uint64_t	debugger_exit_time;
144#if MACH_KDP
145#include <kdp/kdp.h>
146extern int kdp_snapshot;
147static struct _kdp_xcpu_call_func {
148	kdp_x86_xcpu_func_t func;
149	void     *arg0, *arg1;
150	volatile long     ret;
151	volatile uint16_t cpu;
152} kdp_xcpu_call_func = {
153	.cpu  = KDP_XCPU_NONE
154};
155
156#endif
157
158/* Variables needed for MP broadcast. */
159static void        (*mp_bc_action_func)(void *arg);
160static void        *mp_bc_func_arg;
161static int     	mp_bc_ncpus;
162static volatile long   mp_bc_count;
163decl_lck_mtx_data(static, mp_bc_lock);
164lck_mtx_ext_t	mp_bc_lock_ext;
165static	volatile int 	debugger_cpu = -1;
166volatile long	 NMIPI_acks = 0;
167volatile long	 NMI_count = 0;
168
169extern void	NMI_cpus(void);
170
171static void	mp_cpus_call_init(void);
172static void	mp_cpus_call_action(void);
173static void	mp_call_PM(void);
174
175static boolean_t	mp_cpus_call_wait_timeout = FALSE;
176
177char		mp_slave_stack[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); // Temp stack for slave init
178
179/* PAL-related routines */
180boolean_t i386_smp_init(int nmi_vector, i386_intr_func_t nmi_handler,
181		int ipi_vector, i386_intr_func_t ipi_handler);
182void i386_start_cpu(int lapic_id, int cpu_num);
183void i386_send_NMI(int cpu);
184
185#if GPROF
/*
 * Initialize dummy structs for profiling. These aren't used but
 * allow hertz_tick() to be built with GPROF defined.
 */
190struct profile_vars _profile_vars;
191struct profile_vars *_profile_vars_cpus[MAX_CPUS] = { &_profile_vars };
192#define GPROF_INIT()							\
193{									\
194	int	i;							\
195									\
196	/* Hack to initialize pointers to unused profiling structs */	\
197	for (i = 1; i < MAX_CPUS; i++)				\
198		_profile_vars_cpus[i] = &_profile_vars;			\
199}
200#else
201#define GPROF_INIT()
202#endif /* GPROF */
203
204static lck_grp_t 	smp_lck_grp;
205static lck_grp_attr_t	smp_lck_grp_attr;
206
207#define NUM_CPU_WARM_CALLS	20
208struct timer_call	cpu_warm_call_arr[NUM_CPU_WARM_CALLS];
209queue_head_t 		cpu_warm_call_list;
210decl_simple_lock_data(static, cpu_warm_lock);
211
212typedef struct cpu_warm_data {
213	timer_call_t 	cwd_call;
214	uint64_t	cwd_deadline;
215	int		cwd_result;
216} *cpu_warm_data_t;
217
218static void		cpu_prewarm_init(void);
219static void 		cpu_warm_timer_call_func(call_entry_param_t p0, call_entry_param_t p1);
220static void 		_cpu_warm_setup(void *arg);
221static timer_call_t 	grab_warm_timer_call(void);
222static void		free_warm_timer_call(timer_call_t call);
223
224void
225smp_init(void)
226{
227	simple_lock_init(&mp_kdp_lock, 0);
228	simple_lock_init(&mp_rv_lock, 0);
229	lck_grp_attr_setdefault(&smp_lck_grp_attr);
230	lck_grp_init(&smp_lck_grp, "i386_smp", &smp_lck_grp_attr);
231	lck_mtx_init_ext(&mp_cpu_boot_lock, &mp_cpu_boot_lock_ext, &smp_lck_grp, LCK_ATTR_NULL);
232	lck_mtx_init_ext(&mp_bc_lock, &mp_bc_lock_ext, &smp_lck_grp, LCK_ATTR_NULL);
233	console_init();
234
235	if(!i386_smp_init(LAPIC_NMI_INTERRUPT, NMIInterruptHandler,
236				LAPIC_VECTOR(INTERPROCESSOR), cpu_signal_handler))
237		return;
238
239	cpu_thread_init();
240
241	GPROF_INIT();
242	DBGLOG_CPU_INIT(master_cpu);
243
244	mp_cpus_call_init();
245	mp_cpus_call_cpu_init(master_cpu);
246
247	if (PE_parse_boot_argn("TSC_sync_margin",
248					&TSC_sync_margin, sizeof(TSC_sync_margin))) {
249		kprintf("TSC sync Margin 0x%x\n", TSC_sync_margin);
250	} else if (cpuid_vmm_present()) {
251		kprintf("TSC sync margin disabled\n");
252		TSC_sync_margin = 0;
253	}
254	smp_initialized = TRUE;
255
256	cpu_prewarm_init();
257
258	return;
259}
260
261typedef struct {
262	int			target_cpu;
263	int			target_lapic;
264	int			starter_cpu;
265} processor_start_info_t;
266static processor_start_info_t	start_info	  __attribute__((aligned(64)));
267
268/*
269 * Cache-alignment is to avoid cross-cpu false-sharing interference.
270 */
271static volatile long		tsc_entry_barrier __attribute__((aligned(64)));
272static volatile long		tsc_exit_barrier  __attribute__((aligned(64)));
273static volatile uint64_t	tsc_target	  __attribute__((aligned(64)));
274
275/*
276 * Poll a CPU to see when it has marked itself as running.
277 */
278static void
279mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
280{
281	while (iters-- > 0) {
282		if (cpu_datap(slot_num)->cpu_running)
283			break;
284		delay(usecdelay);
285	}
286}
287
/*
 * Quickly bring a halted CPU back online.
 */
291kern_return_t
292intel_startCPU_fast(int slot_num)
293{
294	kern_return_t	rc;
295
296	/*
297	 * Try to perform a fast restart
298	 */
299	rc = pmCPUExitHalt(slot_num);
300	if (rc != KERN_SUCCESS)
301		/*
302		 * The CPU was not eligible for a fast restart.
303		 */
304		return(rc);
305
306	KERNEL_DEBUG_CONSTANT(
307		TRACE_MP_CPU_FAST_START | DBG_FUNC_START,
308		slot_num, 0, 0, 0, 0);
309
310	/*
311	 * Wait until the CPU is back online.
312	 */
313	mp_disable_preemption();
314
	/*
	 * We use short pauses (1us) for low latency.  30,000 iterations is
	 * longer than a full restart would require, so it should be more
	 * than long enough.
	 */
320
321	mp_wait_for_cpu_up(slot_num, 30000, 1);
322	mp_enable_preemption();
323
324	KERNEL_DEBUG_CONSTANT(
325		TRACE_MP_CPU_FAST_START | DBG_FUNC_END,
326		slot_num, cpu_datap(slot_num)->cpu_running, 0, 0, 0);
327
328	/*
329	 * Check to make sure that the CPU is really running.  If not,
330	 * go through the slow path.
331	 */
332	if (cpu_datap(slot_num)->cpu_running)
333		return(KERN_SUCCESS);
334	else
335		return(KERN_FAILURE);
336}
337
338static void
339started_cpu(void)
340{
341	/* Here on the started cpu with cpu_running set TRUE */
342
343	if (TSC_sync_margin &&
344	    start_info.target_cpu == cpu_number()) {
		/*
		 * I've just started up; synchronize again with the starter cpu
		 * and then snap my TSC.
		 */
349		tsc_target   = 0;
350		atomic_decl(&tsc_entry_barrier, 1);
351		while (tsc_entry_barrier != 0)
352			;	/* spin for starter and target at barrier */
353		tsc_target = rdtsc64();
354		atomic_decl(&tsc_exit_barrier, 1);
355	}
356}
357
358static void
359start_cpu(void *arg)
360{
361	int			i = 1000;
362	processor_start_info_t	*psip = (processor_start_info_t *) arg;
363
364	/* Ignore this if the current processor is not the starter */
365	if (cpu_number() != psip->starter_cpu)
366		return;
367
368	DBG("start_cpu(%p) about to start cpu %d, lapic %d\n",
369		arg, psip->target_cpu, psip->target_lapic);
370
371	KERNEL_DEBUG_CONSTANT(
372		TRACE_MP_CPU_START | DBG_FUNC_START,
373		psip->target_cpu,
374		psip->target_lapic, 0, 0, 0);
375
376	i386_start_cpu(psip->target_lapic, psip->target_cpu);
377
378#ifdef	POSTCODE_DELAY
379	/* Wait much longer if postcodes are displayed for a delay period. */
380	i *= 10000;
381#endif
382	DBG("start_cpu(%p) about to wait for cpu %d\n",
383		arg, psip->target_cpu);
384
385	mp_wait_for_cpu_up(psip->target_cpu, i*100, 100);
386
387	KERNEL_DEBUG_CONSTANT(
388		TRACE_MP_CPU_START | DBG_FUNC_END,
389		psip->target_cpu,
390		cpu_datap(psip->target_cpu)->cpu_running, 0, 0, 0);
391
392	if (TSC_sync_margin &&
393	    cpu_datap(psip->target_cpu)->cpu_running) {
		/*
		 * Compare the TSC from the started processor with ours.
		 * Report and log/panic if it diverges by more than
		 * TSC_sync_margin (TSC_SYNC_MARGIN) ticks. This margin
		 * can be overridden by boot-arg (with 0 meaning no checking).
		 */
400		uint64_t	tsc_starter;
401		int64_t		tsc_delta;
402		atomic_decl(&tsc_entry_barrier, 1);
403		while (tsc_entry_barrier != 0)
404			;	/* spin for both processors at barrier */
405		tsc_starter = rdtsc64();
406		atomic_decl(&tsc_exit_barrier, 1);
407		while (tsc_exit_barrier != 0)
408			;	/* spin for target to store its TSC */
409		tsc_delta = tsc_target - tsc_starter;
410		kprintf("TSC sync for cpu %d: 0x%016llx delta 0x%llx (%lld)\n",
411			psip->target_cpu, tsc_target, tsc_delta, tsc_delta);
412		if (ABS(tsc_delta) > (int64_t) TSC_sync_margin) {
413#if DEBUG
414			panic(
415#else
416			printf(
417#endif
				"Unsynchronized TSC for cpu %d: "
419					"0x%016llx, delta 0x%llx\n",
420				psip->target_cpu, tsc_target, tsc_delta);
421		}
422	}
423}
424
425kern_return_t
426intel_startCPU(
427	int	slot_num)
428{
429	int		lapic = cpu_to_lapic[slot_num];
430	boolean_t	istate;
431
432	assert(lapic != -1);
433
434	DBGLOG_CPU_INIT(slot_num);
435
436	DBG("intel_startCPU(%d) lapic_id=%d\n", slot_num, lapic);
437	DBG("IdlePTD(%p): 0x%x\n", &IdlePTD, (int) (uintptr_t)IdlePTD);
438
439	/*
440	 * Initialize (or re-initialize) the descriptor tables for this cpu.
441	 * Propagate processor mode to slave.
442	 */
443	cpu_desc_init64(cpu_datap(slot_num));
444
445	/* Serialize use of the slave boot stack, etc. */
446	lck_mtx_lock(&mp_cpu_boot_lock);
447
448	istate = ml_set_interrupts_enabled(FALSE);
449	if (slot_num == get_cpu_number()) {
450		ml_set_interrupts_enabled(istate);
451		lck_mtx_unlock(&mp_cpu_boot_lock);
452		return KERN_SUCCESS;
453	}
454
455	start_info.starter_cpu  = cpu_number();
456	start_info.target_cpu   = slot_num;
457	start_info.target_lapic = lapic;
458	tsc_entry_barrier = 2;
459	tsc_exit_barrier = 2;
460
461	/*
462	 * Perform the processor startup sequence with all running
463	 * processors rendezvous'ed. This is required during periods when
464	 * the cache-disable bit is set for MTRR/PAT initialization.
465	 */
466	mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);
467
468	start_info.target_cpu = 0;
469
470	ml_set_interrupts_enabled(istate);
471	lck_mtx_unlock(&mp_cpu_boot_lock);
472
473	if (!cpu_datap(slot_num)->cpu_running) {
474		kprintf("Failed to start CPU %02d\n", slot_num);
475		printf("Failed to start CPU %02d, rebooting...\n", slot_num);
476		delay(1000000);
477		halt_cpu();
478		return KERN_SUCCESS;
479	} else {
480		kprintf("Started cpu %d (lapic id %08x)\n", slot_num, lapic);
481		return KERN_SUCCESS;
482	}
483}
484
485#if	MP_DEBUG
486cpu_signal_event_log_t	*cpu_signal[MAX_CPUS];
487cpu_signal_event_log_t	*cpu_handle[MAX_CPUS];
488
489MP_EVENT_NAME_DECL();
490
491#endif	/* MP_DEBUG */
492
493/*
494 * Note: called with NULL state when polling for TLB flush and cross-calls.
495 */
496int
497cpu_signal_handler(x86_saved_state_t *regs)
498{
499#if	!MACH_KDP
500#pragma unused (regs)
501#endif /* !MACH_KDP */
502	int		my_cpu;
503	volatile int	*my_word;
504
505	SCHED_STATS_IPI(current_processor());
506
507	my_cpu = cpu_number();
508	my_word = &cpu_data_ptr[my_cpu]->cpu_signals;
	/* Store the initial set of signals for diagnostics. New
	 * signals could arrive while these are being processed,
	 * so it's no more than a hint.
	 */
513
514	cpu_data_ptr[my_cpu]->cpu_prior_signals = *my_word;
515
516	do {
517#if	MACH_KDP
518		if (i_bit(MP_KDP, my_word)) {
519			DBGLOG(cpu_handle,my_cpu,MP_KDP);
520			i_bit_clear(MP_KDP, my_word);
521/* Ensure that the i386_kernel_state at the base of the
522 * current thread's stack (if any) is synchronized with the
523 * context at the moment of the interrupt, to facilitate
524 * access through the debugger.
525 */
526			sync_iss_to_iks(regs);
527			if (pmsafe_debug && !kdp_snapshot)
528				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
529			mp_kdp_wait(TRUE, FALSE);
530			if (pmsafe_debug && !kdp_snapshot)
531				pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
532		} else
533#endif	/* MACH_KDP */
534		if (i_bit(MP_TLB_FLUSH, my_word)) {
535			DBGLOG(cpu_handle,my_cpu,MP_TLB_FLUSH);
536			i_bit_clear(MP_TLB_FLUSH, my_word);
537			pmap_update_interrupt();
538		} else if (i_bit(MP_RENDEZVOUS, my_word)) {
539			DBGLOG(cpu_handle,my_cpu,MP_RENDEZVOUS);
540			i_bit_clear(MP_RENDEZVOUS, my_word);
541			mp_rendezvous_action();
542		} else if (i_bit(MP_BROADCAST, my_word)) {
543			DBGLOG(cpu_handle,my_cpu,MP_BROADCAST);
544			i_bit_clear(MP_BROADCAST, my_word);
545			mp_broadcast_action();
546		} else if (i_bit(MP_CHUD, my_word)) {
547			DBGLOG(cpu_handle,my_cpu,MP_CHUD);
548			i_bit_clear(MP_CHUD, my_word);
549			chudxnu_cpu_signal_handler();
550		} else if (i_bit(MP_CALL, my_word)) {
551			DBGLOG(cpu_handle,my_cpu,MP_CALL);
552			i_bit_clear(MP_CALL, my_word);
553			mp_cpus_call_action();
554		} else if (i_bit(MP_CALL_PM, my_word)) {
555			DBGLOG(cpu_handle,my_cpu,MP_CALL_PM);
556			i_bit_clear(MP_CALL_PM, my_word);
557			mp_call_PM();
558		}
559		if (regs == NULL) {
560			/* Called to poll only for cross-calls and TLB flush */
561			break;
562		} else if (i_bit(MP_AST, my_word)) {
563			DBGLOG(cpu_handle,my_cpu,MP_AST);
564			i_bit_clear(MP_AST, my_word);
565			ast_check(cpu_to_processor(my_cpu));
566		}
567	} while (*my_word);
568
569	return 0;
570}
571
572extern void kprintf_break_lock(void);
573static int
574NMIInterruptHandler(x86_saved_state_t *regs)
575{
576	void 		*stackptr;
577
578	if (panic_active() && !panicDebugging) {
579		if (pmsafe_debug)
580			pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
581		for(;;)
582			cpu_pause();
583	}
584
585	atomic_incl(&NMIPI_acks, 1);
586	atomic_incl(&NMI_count, 1);
587	sync_iss_to_iks_unconditionally(regs);
588	__asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
589
590	if (cpu_number() == debugger_cpu)
591		goto NMExit;
592
593	if (spinlock_timed_out) {
594		char pstr[192];
595		snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): NMIPI for spinlock acquisition timeout, spinlock: %p, spinlock owner: %p, current_thread: %p, spinlock_owner_cpu: 0x%x\n", cpu_number(), spinlock_timed_out, (void *) spinlock_timed_out->interlock.lock_data, current_thread(), spinlock_owner_cpu);
596		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
597	} else if (mp_cpus_call_wait_timeout) {
598		char pstr[192];
599		snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor, this CPU timed-out during cross-call\n", cpu_number());
600		panic_i386_backtrace(stackptr, 64, &pstr[0], TRUE, regs);
601	} else if (pmap_tlb_flush_timeout == TRUE) {
602		char pstr[128];
603		snprintf(&pstr[0], sizeof(pstr), "Panic(CPU %d): Unresponsive processor (this CPU did not acknowledge interrupts) TLB state:0x%x\n", cpu_number(), current_cpu_datap()->cpu_tlb_invalid);
604		panic_i386_backtrace(stackptr, 48, &pstr[0], TRUE, regs);
605	}
606
607#if MACH_KDP
608	if (pmsafe_debug && !kdp_snapshot)
609		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
610	current_cpu_datap()->cpu_NMI_acknowledged = TRUE;
611	i_bit_clear(MP_KDP, &current_cpu_datap()->cpu_signals);
612	if (pmap_tlb_flush_timeout ||
613	    spinlock_timed_out ||
614	    mp_cpus_call_wait_timeout ||
615	    panic_active()) {
616		mp_kdp_wait(FALSE, TRUE);
617	} else if (virtualized && (debug_boot_arg & DB_NMI)) {
618		/*
619		 * Under a VMM with the debug boot-arg set, drop into kdp.
620		 * Since an NMI is involved, there's a risk of contending with
621		 * a panic. And side-effects of NMIs may result in entry into,
622		 * and continuing from, the debugger being unreliable.
623		 */
624		kprintf_break_lock();
625		kprintf("Debugger entry requested by NMI\n");
626		kdp_i386_trap(T_DEBUG, saved_state64(regs), 0, 0);
627		printf("Debugger entry requested by NMI\n");
628	} else {
629		mp_kdp_wait(FALSE, FALSE);
630	}
631	if (pmsafe_debug && !kdp_snapshot)
632		pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
633#endif
634NMExit:
635	return 1;
636}
637
638
/*
 * cpu_interrupt is really just to be used by the scheduler to
 * get a CPU's attention; it may not always issue an IPI.  If an
 * IPI is always needed, use i386_cpu_IPI.
 */
644void
645cpu_interrupt(int cpu)
646{
647	boolean_t did_IPI = FALSE;
648
649	if (smp_initialized
650	    && pmCPUExitIdle(cpu_datap(cpu))) {
651		i386_cpu_IPI(cpu);
652		did_IPI = TRUE;
653	}
654
655	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, did_IPI, 0, 0, 0);
656}
657
658/*
659 * Send a true NMI via the local APIC to the specified CPU.
660 */
661void
662cpu_NMI_interrupt(int cpu)
663{
664	if (smp_initialized) {
665		i386_send_NMI(cpu);
666	}
667}
668
669void
670NMI_cpus(void)
671{
672	unsigned int	cpu;
673	boolean_t	intrs_enabled;
674	uint64_t	tsc_timeout;
675
676	intrs_enabled = ml_set_interrupts_enabled(FALSE);
677
678	for (cpu = 0; cpu < real_ncpus; cpu++) {
679		if (!cpu_datap(cpu)->cpu_running)
680			continue;
681		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
682		cpu_NMI_interrupt(cpu);
683		tsc_timeout = !machine_timeout_suspended() ?
684				rdtsc64() + (1000 * 1000 * 1000 * 10ULL) :
685				~0ULL;
686		while (!cpu_datap(cpu)->cpu_NMI_acknowledged) {
687			handle_pending_TLB_flushes();
688			cpu_pause();
689			if (rdtsc64() > tsc_timeout)
690				panic("NMI_cpus() timeout cpu %d", cpu);
691		}
692		cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
693	}
694
695	ml_set_interrupts_enabled(intrs_enabled);
696}
697
698static void	(* volatile mp_PM_func)(void) = NULL;
699
700static void
701mp_call_PM(void)
702{
703	assert(!ml_get_interrupts_enabled());
704
705	if (mp_PM_func != NULL)
706		mp_PM_func();
707}
708
709void
710cpu_PM_interrupt(int cpu)
711{
712	assert(!ml_get_interrupts_enabled());
713
714	if (mp_PM_func != NULL) {
715		if (cpu == cpu_number())
716			mp_PM_func();
717		else
718			i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
719	}
720}
721
722void
723PM_interrupt_register(void (*fn)(void))
724{
725	mp_PM_func = fn;
726}
727
728void
729i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
730{
731	volatile int	*signals = &cpu_datap(cpu)->cpu_signals;
732	uint64_t	tsc_timeout;
733
734
735	if (!cpu_datap(cpu)->cpu_running)
736		return;
737
738	if (event == MP_TLB_FLUSH)
739	        KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_START, cpu, 0, 0, 0, 0);
740
741	DBGLOG(cpu_signal, cpu, event);
742
743	i_bit_set(event, signals);
744	i386_cpu_IPI(cpu);
745	if (mode == SYNC) {
746	   again:
747		tsc_timeout = !machine_timeout_suspended() ?
748					rdtsc64() + (1000*1000*1000) :
749					~0ULL;
750		while (i_bit(event, signals) && rdtsc64() < tsc_timeout) {
751			cpu_pause();
752		}
753		if (i_bit(event, signals)) {
754			DBG("i386_signal_cpu(%d, 0x%x, SYNC) timed out\n",
755				cpu, event);
756			goto again;
757		}
758	}
759	if (event == MP_TLB_FLUSH)
760	        KERNEL_DEBUG(TRACE_MP_TLB_FLUSH | DBG_FUNC_END, cpu, 0, 0, 0, 0);
761}
762
763/*
764 * Send event to all running cpus.
765 * Called with the topology locked.
766 */
767void
768i386_signal_cpus(mp_event_t event, mp_sync_t mode)
769{
770	unsigned int	cpu;
771	unsigned int	my_cpu = cpu_number();
772
773	assert(hw_lock_held((hw_lock_t)&x86_topo_lock));
774
775	for (cpu = 0; cpu < real_ncpus; cpu++) {
776		if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
777			continue;
778		i386_signal_cpu(cpu, event, mode);
779	}
780}
781
782/*
783 * Return the number of running cpus.
784 * Called with the topology locked.
785 */
786int
787i386_active_cpus(void)
788{
789	unsigned int	cpu;
790	unsigned int	ncpus = 0;
791
792	assert(hw_lock_held((hw_lock_t)&x86_topo_lock));
793
794	for (cpu = 0; cpu < real_ncpus; cpu++) {
795		if (cpu_datap(cpu)->cpu_running)
796			ncpus++;
797	}
798	return(ncpus);
799}
800
801/*
802 * Helper function called when busy-waiting: panic if too long
803 * a TSC-based time has elapsed since the start of the spin.
804 */
805static boolean_t
806mp_spin_timeout(uint64_t tsc_start)
807{
808	uint64_t	tsc_timeout;
809
810	cpu_pause();
811	if (machine_timeout_suspended())
812		return FALSE;
813
814	/*
815	 * The timeout is 4 * the spinlock timeout period
816	 * unless we have serial console printing (kprintf) enabled
817	 * in which case we allow an even greater margin.
818	 */
819	tsc_timeout = disable_serial_output ? (uint64_t) LockTimeOutTSC << 2
820					    : (uint64_t) LockTimeOutTSC << 4;
821	return  (rdtsc64() > tsc_start + tsc_timeout);
822}
823
824/*
825 * Helper function to take a spinlock while ensuring that incoming IPIs
826 * are still serviced if interrupts are masked while we spin.
827 */
828static boolean_t
829mp_safe_spin_lock(usimple_lock_t lock)
830{
831	if (ml_get_interrupts_enabled()) {
832		simple_lock(lock);
833		return TRUE;
834	} else {
835		uint64_t tsc_spin_start = rdtsc64();
836		while (!simple_lock_try(lock)) {
837			cpu_signal_handler(NULL);
838			if (mp_spin_timeout(tsc_spin_start)) {
839				uint32_t lock_cpu;
840				uintptr_t lowner = (uintptr_t)
841						   lock->interlock.lock_data;
842				spinlock_timed_out = lock;
843				lock_cpu = spinlock_timeout_NMI(lowner);
844				panic("mp_safe_spin_lock() timed out,"
845				      " lock: %p, owner thread: 0x%lx,"
846				      " current_thread: %p, owner on CPU 0x%x",
847				      lock, lowner,
848				      current_thread(), lock_cpu);
849			}
850		}
851		return FALSE;
852	}
853}
854
855/*
856 * All-CPU rendezvous:
857 * 	- CPUs are signalled,
858 *	- all execute the setup function (if specified),
859 *	- rendezvous (i.e. all cpus reach a barrier),
860 *	- all execute the action function (if specified),
861 *	- rendezvous again,
862 *	- execute the teardown function (if specified), and then
863 *	- resume.
864 *
865 * Note that the supplied external functions _must_ be reentrant and aware
866 * that they are running in parallel and in an unknown lock context.
867 */
868
869static void
870mp_rendezvous_action(void)
871{
872	boolean_t	intrs_enabled;
873	uint64_t	tsc_spin_start;
874
875	/* setup function */
876	if (mp_rv_setup_func != NULL)
877		mp_rv_setup_func(mp_rv_func_arg);
878
879	intrs_enabled = ml_get_interrupts_enabled();
880
881	/* spin on entry rendezvous */
882	atomic_incl(&mp_rv_entry, 1);
883	tsc_spin_start = rdtsc64();
884	while (mp_rv_entry < mp_rv_ncpus) {
885		/* poll for pesky tlb flushes if interrupts disabled */
886		if (!intrs_enabled)
887			handle_pending_TLB_flushes();
888		if (mp_spin_timeout(tsc_spin_start))
889			panic("mp_rendezvous_action() entry");
890	}
891
892	/* action function */
893	if (mp_rv_action_func != NULL)
894		mp_rv_action_func(mp_rv_func_arg);
895
896	/* spin on exit rendezvous */
897	atomic_incl(&mp_rv_exit, 1);
898	tsc_spin_start = rdtsc64();
899	while (mp_rv_exit < mp_rv_ncpus) {
900		if (!intrs_enabled)
901			handle_pending_TLB_flushes();
902		if (mp_spin_timeout(tsc_spin_start))
903			panic("mp_rendezvous_action() exit");
904	}
905
906	/* teardown function */
907	if (mp_rv_teardown_func != NULL)
908		mp_rv_teardown_func(mp_rv_func_arg);
909
910	/* Bump completion count */
911	atomic_incl(&mp_rv_complete, 1);
912}
913
914void
915mp_rendezvous(void (*setup_func)(void *),
916	      void (*action_func)(void *),
917	      void (*teardown_func)(void *),
918	      void *arg)
919{
920	uint64_t	tsc_spin_start;
921
922	if (!smp_initialized) {
923		if (setup_func != NULL)
924			setup_func(arg);
925		if (action_func != NULL)
926			action_func(arg);
927		if (teardown_func != NULL)
928			teardown_func(arg);
929		return;
930	}
931
932	/* obtain rendezvous lock */
933	(void) mp_safe_spin_lock(&mp_rv_lock);
934
935	/* set static function pointers */
936	mp_rv_setup_func = setup_func;
937	mp_rv_action_func = action_func;
938	mp_rv_teardown_func = teardown_func;
939	mp_rv_func_arg = arg;
940
941	mp_rv_entry    = 0;
942	mp_rv_exit     = 0;
943	mp_rv_complete = 0;
944
945	/*
946	 * signal other processors, which will call mp_rendezvous_action()
947	 * with interrupts disabled
948	 */
949	(void) mp_safe_spin_lock(&x86_topo_lock);
950	mp_rv_ncpus = i386_active_cpus();
951	i386_signal_cpus(MP_RENDEZVOUS, ASYNC);
952	simple_unlock(&x86_topo_lock);
953
954	/* call executor function on this cpu */
955	mp_rendezvous_action();
956
957	/*
958	 * Spin for everyone to complete.
959	 * This is necessary to ensure that all processors have proceeded
960	 * from the exit barrier before we release the rendezvous structure.
961	 */
962	tsc_spin_start = rdtsc64();
963	while (mp_rv_complete < mp_rv_ncpus) {
964		if (mp_spin_timeout(tsc_spin_start))
965			panic("mp_rendezvous() timeout");
966	}
967
968	/* Tidy up */
969	mp_rv_setup_func = NULL;
970	mp_rv_action_func = NULL;
971	mp_rv_teardown_func = NULL;
972	mp_rv_func_arg = NULL;
973
974	/* release lock */
975	simple_unlock(&mp_rv_lock);
976}
977
978void
979mp_rendezvous_break_lock(void)
980{
981	simple_lock_init(&mp_rv_lock, 0);
982}
983
984static void
985setup_disable_intrs(__unused void * param_not_used)
986{
987	/* disable interrupts before the first barrier */
988	boolean_t intr = ml_set_interrupts_enabled(FALSE);
989
990	current_cpu_datap()->cpu_iflag = intr;
991	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
992}
993
994static void
995teardown_restore_intrs(__unused void * param_not_used)
996{
997	/* restore interrupt flag following MTRR changes */
998	ml_set_interrupts_enabled(current_cpu_datap()->cpu_iflag);
999	DBG("CPU%d: %s\n", get_cpu_number(), __FUNCTION__);
1000}
1001
1002/*
 * A wrapper around mp_rendezvous() that calls action_func() with interrupts disabled.
1004 * This is exported for use by kexts.
1005 */
1006void
1007mp_rendezvous_no_intrs(
1008	      void (*action_func)(void *),
1009	      void *arg)
1010{
1011	mp_rendezvous(setup_disable_intrs,
1012		      action_func,
1013		      teardown_restore_intrs,
1014		      arg);
1015}
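
/*
 * Illustrative usage (editor's sketch, not part of the original source):
 * a hypothetical caller that must program the same MSR on every CPU could
 * use mp_rendezvous_no_intrs() so the write happens on all processors with
 * interrupts masked. The action routine must be reentrant and lock-free,
 * since it runs concurrently on every CPU in an unknown lock context.
 * wrmsr64() and MSR_IA32_MISC_ENABLE are assumed from <i386/proc_reg.h>;
 * the example function names are invented for this sketch.
 *
 *	static void
 *	example_set_misc_enable(void *arg)
 *	{
 *		// Runs on every CPU, interrupts disabled.
 *		wrmsr64(MSR_IA32_MISC_ENABLE, *(uint64_t *) arg);
 *	}
 *
 *	void
 *	example_set_misc_enable_all_cpus(uint64_t value)
 *	{
 *		mp_rendezvous_no_intrs(example_set_misc_enable, &value);
 *	}
 */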
1016
1017
1018typedef struct {
1019	queue_chain_t	link;			/* queue linkage */
1020	void		(*func)(void *,void *);	/* routine to call */
1021	void		*arg0;			/* routine's 1st arg */
1022	void		*arg1;			/* routine's 2nd arg */
1023	cpumask_t	*maskp;			/* completion response mask */
1024} mp_call_t;
1025
1026
1027typedef struct {
1028	queue_head_t		queue;
1029	decl_simple_lock_data(,	lock);
1030} mp_call_queue_t;
1031#define MP_CPUS_CALL_BUFS_PER_CPU	MAX_CPUS
1032static mp_call_queue_t	mp_cpus_call_freelist;
1033static mp_call_queue_t	mp_cpus_call_head[MAX_CPUS];
1034
1035static inline boolean_t
1036mp_call_head_lock(mp_call_queue_t *cqp)
1037{
1038	boolean_t	intrs_enabled;
1039
1040	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1041	simple_lock(&cqp->lock);
1042
1043	return intrs_enabled;
1044}
1045
1046void
1047mp_cpus_NMIPI(cpumask_t cpu_mask) {
1048	unsigned int cpu, cpu_bit;
1049	uint64_t deadline;
1050
1051	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
1052		if (cpu_mask & cpu_bit)
1053			cpu_NMI_interrupt(cpu);
1054	}
1055	deadline = mach_absolute_time() + (LockTimeOut);
1056	while (mach_absolute_time() < deadline)
1057		cpu_pause();
1058}
1059
1060#if MACH_ASSERT
1061static inline boolean_t
1062mp_call_head_is_locked(mp_call_queue_t *cqp)
1063{
1064	return !ml_get_interrupts_enabled() &&
1065		hw_lock_held((hw_lock_t)&cqp->lock);
1066}
1067#endif
1068
1069static inline void
1070mp_call_head_unlock(mp_call_queue_t *cqp, boolean_t intrs_enabled)
1071{
1072	simple_unlock(&cqp->lock);
1073	ml_set_interrupts_enabled(intrs_enabled);
1074}
1075
1076static inline mp_call_t *
1077mp_call_alloc(void)
1078{
1079	mp_call_t	*callp = NULL;
1080	boolean_t	intrs_enabled;
1081	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
1082
1083	intrs_enabled = mp_call_head_lock(cqp);
1084	if (!queue_empty(&cqp->queue))
1085		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1086	mp_call_head_unlock(cqp, intrs_enabled);
1087
1088	return callp;
1089}
1090
1091static inline void
1092mp_call_free(mp_call_t *callp)
1093{
1094	boolean_t	intrs_enabled;
1095	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
1096
1097	intrs_enabled = mp_call_head_lock(cqp);
1098	queue_enter_first(&cqp->queue, callp, typeof(callp), link);
1099	mp_call_head_unlock(cqp, intrs_enabled);
1100}
1101
1102static inline mp_call_t *
1103mp_call_dequeue_locked(mp_call_queue_t *cqp)
1104{
1105	mp_call_t	*callp = NULL;
1106
1107	assert(mp_call_head_is_locked(cqp));
1108	if (!queue_empty(&cqp->queue))
1109		queue_remove_first(&cqp->queue, callp, typeof(callp), link);
1110	return callp;
1111}
1112
1113static inline void
1114mp_call_enqueue_locked(
1115	mp_call_queue_t	*cqp,
1116	mp_call_t	*callp)
1117{
1118	queue_enter(&cqp->queue, callp, typeof(callp), link);
1119}
1120
1121/* Called on the boot processor to initialize global structures */
1122static void
1123mp_cpus_call_init(void)
1124{
1125	mp_call_queue_t	*cqp = &mp_cpus_call_freelist;
1126
1127	DBG("mp_cpus_call_init()\n");
1128	simple_lock_init(&cqp->lock, 0);
1129	queue_init(&cqp->queue);
1130}
1131
1132/*
1133 * Called at processor registration to add call buffers to the free list
1134 * and to initialize the per-cpu call queue.
1135 */
1136void
1137mp_cpus_call_cpu_init(int cpu)
1138{
1139	int		i;
1140	mp_call_queue_t	*cqp = &mp_cpus_call_head[cpu];
1141	mp_call_t	*callp;
1142
1143	simple_lock_init(&cqp->lock, 0);
1144	queue_init(&cqp->queue);
1145	for (i = 0; i < MP_CPUS_CALL_BUFS_PER_CPU; i++) {
1146		callp = (mp_call_t *) kalloc(sizeof(mp_call_t));
1147		mp_call_free(callp);
1148	}
1149
	DBG("mp_cpus_call_cpu_init(%d) done\n", cpu);
1151}
1152
/*
 * This is called from cpu_signal_handler() to process an MP_CALL signal,
 * and also from i386_deactivate_cpu() when a cpu is being taken offline.
 */
1157static void
1158mp_cpus_call_action(void)
1159{
1160	mp_call_queue_t	*cqp;
1161	boolean_t	intrs_enabled;
1162	mp_call_t	*callp;
1163	mp_call_t	call;
1164
1165	assert(!ml_get_interrupts_enabled());
1166	cqp = &mp_cpus_call_head[cpu_number()];
1167	intrs_enabled = mp_call_head_lock(cqp);
1168	while ((callp = mp_call_dequeue_locked(cqp)) != NULL) {
1169		/* Copy call request to the stack to free buffer */
1170		call = *callp;
1171		mp_call_free(callp);
1172		if (call.func != NULL) {
1173			mp_call_head_unlock(cqp, intrs_enabled);
1174			KERNEL_DEBUG_CONSTANT(
1175				TRACE_MP_CPUS_CALL_ACTION,
1176				call.func, call.arg0, call.arg1, call.maskp, 0);
1177			call.func(call.arg0, call.arg1);
1178			(void) mp_call_head_lock(cqp);
1179		}
1180		if (call.maskp != NULL)
1181			i_bit_set(cpu_number(), call.maskp);
1182	}
1183	mp_call_head_unlock(cqp, intrs_enabled);
1184}
1185
1186/*
1187 * mp_cpus_call() runs a given function on cpus specified in a given cpu mask.
1188 * Possible modes are:
1189 *  SYNC:   function is called serially on target cpus in logical cpu order
1190 *	    waiting for each call to be acknowledged before proceeding
1191 *  ASYNC:  function call is queued to the specified cpus
1192 *	    waiting for all calls to complete in parallel before returning
1193 *  NOSYNC: function calls are queued
1194 *	    but we return before confirmation of calls completing.
1195 * The action function may be NULL.
1196 * The cpu mask may include the local cpu. Offline cpus are ignored.
1197 * The return value is the number of cpus on which the call was made or queued.
1198 */
1199cpu_t
1200mp_cpus_call(
1201	cpumask_t	cpus,
1202	mp_sync_t	mode,
1203        void		(*action_func)(void *),
1204        void		*arg)
1205{
1206	return mp_cpus_call1(
1207			cpus,
1208			mode,
1209			(void (*)(void *,void *))action_func,
1210			arg,
1211			NULL,
1212			NULL,
1213			NULL);
1214}
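
/*
 * Illustrative usage (editor's sketch, not part of the original source):
 * queue a function on every running CPU and wait for all of them to finish.
 * The example action and the per-cpu work it pretends to do are invented;
 * CPUMASK_ALL is assumed to be available (alongside CPUMASK_SELF, which
 * this file already uses).
 *
 *	static void
 *	example_drain_local_state(void *arg)
 *	{
 *		(void) arg;
 *		// per-cpu work goes here; runs with interrupts disabled
 *	}
 *
 *	void
 *	example_drain_all_cpus(void)
 *	{
 *		cpu_t	count;
 *
 *		// ASYNC: queued to every CPU, caller waits for completion.
 *		count = mp_cpus_call(CPUMASK_ALL, ASYNC,
 *				     example_drain_local_state, NULL);
 *		kprintf("example: drained %d cpus\n", count);
 *	}
 */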
1215
1216static void
1217mp_cpus_call_wait(boolean_t	intrs_enabled,
1218		  cpumask_t	cpus_called,
1219		  cpumask_t	*cpus_responded)
1220{
1221	mp_call_queue_t		*cqp;
1222	uint64_t		tsc_spin_start;
1223
1224	cqp = &mp_cpus_call_head[cpu_number()];
1225
1226	tsc_spin_start = rdtsc64();
1227	while (*cpus_responded != cpus_called) {
1228		if (!intrs_enabled) {
1229			/* Sniffing w/o locking */
1230			if (!queue_empty(&cqp->queue))
1231				mp_cpus_call_action();
1232			cpu_signal_handler(NULL);
1233		}
1234		if (mp_spin_timeout(tsc_spin_start)) {
1235			cpumask_t	cpus_unresponsive;
1236
1237			mp_cpus_call_wait_timeout = TRUE;
1238			cpus_unresponsive = cpus_called & ~(*cpus_responded);
1239			mp_cpus_NMIPI(cpus_unresponsive);
1240			panic("mp_cpus_call_wait() timeout, cpus: 0x%lx",
1241				cpus_unresponsive);
1242		}
1243	}
1244}
1245
1246cpu_t
1247mp_cpus_call1(
1248	cpumask_t	cpus,
1249	mp_sync_t	mode,
1250        void		(*action_func)(void *, void *),
1251        void		*arg0,
1252        void		*arg1,
1253	cpumask_t	*cpus_calledp,
1254	cpumask_t	*cpus_notcalledp)
1255{
1256	cpu_t		cpu;
1257	boolean_t	intrs_enabled = FALSE;
1258	boolean_t	call_self = FALSE;
1259	cpumask_t	cpus_called = 0;
1260	cpumask_t	cpus_notcalled = 0;
1261	cpumask_t	cpus_responded = 0;
1262	long 		cpus_call_count = 0;
1263	uint64_t	tsc_spin_start;
1264	boolean_t	topo_lock;
1265
1266	KERNEL_DEBUG_CONSTANT(
1267		TRACE_MP_CPUS_CALL | DBG_FUNC_START,
1268		cpus, mode, VM_KERNEL_UNSLIDE(action_func), arg0, arg1);
1269
1270	if (!smp_initialized) {
1271		if ((cpus & CPUMASK_SELF) == 0)
1272			goto out;
1273		if (action_func != NULL) {
1274			intrs_enabled = ml_set_interrupts_enabled(FALSE);
1275			action_func(arg0, arg1);
1276			ml_set_interrupts_enabled(intrs_enabled);
1277		}
1278		call_self = TRUE;
1279		goto out;
1280	}
1281
1282	/*
1283	 * Queue the call for each non-local requested cpu.
	 * This is performed under the topo lock to prevent changes to
	 * cpu online state and to prevent concurrent rendezvouses --
1286	 * although an exception is made if we're calling only the master
1287	 * processor since that always remains active. Note: this exception
1288	 * is expected for longterm timer nosync cross-calls to the master cpu.
1289	 */
1290	mp_disable_preemption();
1291	intrs_enabled = ml_get_interrupts_enabled();
1292	topo_lock = (cpus != cpu_to_cpumask(master_cpu));
1293	if (topo_lock) {
1294		ml_set_interrupts_enabled(FALSE);
1295		(void) mp_safe_spin_lock(&x86_topo_lock);
1296	}
1297	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1298		if (((cpu_to_cpumask(cpu) & cpus) == 0) ||
1299		    !cpu_datap(cpu)->cpu_running)
1300			continue;
1301		tsc_spin_start = rdtsc64();
1302		if (cpu == (cpu_t) cpu_number()) {
1303			/*
1304			 * We don't IPI ourself and if calling asynchronously,
1305			 * we defer our call until we have signalled all others.
1306			 */
1307			call_self = TRUE;
1308			if (mode == SYNC && action_func != NULL) {
1309				KERNEL_DEBUG_CONSTANT(
1310					TRACE_MP_CPUS_CALL_LOCAL,
1311					VM_KERNEL_UNSLIDE(action_func),
1312					arg0, arg1, 0, 0);
1313				action_func(arg0, arg1);
1314			}
1315		} else {
1316			/*
1317			 * Here to queue a call to cpu and IPI.
1318			 * Spinning for request buffer unless NOSYNC.
1319			 */
1320			mp_call_t	*callp = NULL;
1321			mp_call_queue_t	*cqp = &mp_cpus_call_head[cpu];
1322			boolean_t	intrs_inner;
1323
1324		queue_call:
1325			if (callp == NULL)
1326				callp = mp_call_alloc();
1327			intrs_inner = mp_call_head_lock(cqp);
1328			if (mode == NOSYNC) {
1329				if (callp == NULL) {
1330					cpus_notcalled |= cpu_to_cpumask(cpu);
1331					mp_call_head_unlock(cqp, intrs_inner);
1332					KERNEL_DEBUG_CONSTANT(
1333						TRACE_MP_CPUS_CALL_NOBUF,
1334						cpu, 0, 0, 0, 0);
1335					continue;
1336				}
1337				callp->maskp = NULL;
1338			} else {
1339				if (callp == NULL) {
1340					mp_call_head_unlock(cqp, intrs_inner);
1341					KERNEL_DEBUG_CONSTANT(
1342						TRACE_MP_CPUS_CALL_NOBUF,
1343						cpu, 0, 0, 0, 0);
1344					if (!intrs_inner) {
1345						/* Sniffing w/o locking */
1346						if (!queue_empty(&cqp->queue))
1347							mp_cpus_call_action();
1348						handle_pending_TLB_flushes();
1349					}
1350					if (mp_spin_timeout(tsc_spin_start))
1351						panic("mp_cpus_call1() timeout");
1352					goto queue_call;
1353				}
1354				callp->maskp = &cpus_responded;
1355			}
1356			callp->func = action_func;
1357			callp->arg0 = arg0;
1358			callp->arg1 = arg1;
1359			mp_call_enqueue_locked(cqp, callp);
1360			cpus_call_count++;
1361			cpus_called |= cpu_to_cpumask(cpu);
1362			i386_signal_cpu(cpu, MP_CALL, ASYNC);
1363			mp_call_head_unlock(cqp, intrs_inner);
1364			if (mode == SYNC) {
1365				mp_cpus_call_wait(intrs_inner, cpus_called, &cpus_responded);
1366			}
1367		}
1368	}
1369	if (topo_lock) {
1370		simple_unlock(&x86_topo_lock);
1371		ml_set_interrupts_enabled(intrs_enabled);
1372	}
1373
1374	/* Call locally if mode not SYNC */
1375	if (mode != SYNC && call_self ) {
1376		KERNEL_DEBUG_CONSTANT(
1377			TRACE_MP_CPUS_CALL_LOCAL,
1378			VM_KERNEL_UNSLIDE(action_func), arg0, arg1, 0, 0);
1379		if (action_func != NULL) {
1380			ml_set_interrupts_enabled(FALSE);
1381			action_func(arg0, arg1);
1382			ml_set_interrupts_enabled(intrs_enabled);
1383		}
1384	}
1385
1386	/* Safe to allow pre-emption now */
1387	mp_enable_preemption();
1388
1389	/* For ASYNC, now wait for all signaled cpus to complete their calls */
1390	if (mode == ASYNC)
1391		mp_cpus_call_wait(intrs_enabled, cpus_called, &cpus_responded);
1392
1393out:
1394	if (call_self){
1395		cpus_called |= cpu_to_cpumask(cpu);
1396		cpus_call_count++;
1397	}
1398
1399	if (cpus_calledp)
1400		*cpus_calledp = cpus_called;
1401	if (cpus_notcalledp)
1402		*cpus_notcalledp = cpus_notcalled;
1403
1404	KERNEL_DEBUG_CONSTANT(
1405		TRACE_MP_CPUS_CALL | DBG_FUNC_END,
1406		cpus_call_count, cpus_called, cpus_notcalled, 0, 0);
1407
1408	return (cpu_t) cpus_call_count;
1409}
1410
1411
1412static void
1413mp_broadcast_action(void)
1414{
1415   /* call action function */
1416   if (mp_bc_action_func != NULL)
1417       mp_bc_action_func(mp_bc_func_arg);
1418
1419   /* if we're the last one through, wake up the instigator */
1420   if (atomic_decl_and_test(&mp_bc_count, 1))
1421       thread_wakeup(((event_t)(uintptr_t) &mp_bc_count));
1422}
1423
1424/*
1425 * mp_broadcast() runs a given function on all active cpus.
 * The caller blocks until the function has run on all cpus.
 * The caller will also block if there is another pending broadcast.
1428 */
1429void
1430mp_broadcast(
1431         void (*action_func)(void *),
1432         void *arg)
1433{
1434   if (!smp_initialized) {
1435       if (action_func != NULL)
1436	           action_func(arg);
1437       return;
1438   }
1439
1440   /* obtain broadcast lock */
1441   lck_mtx_lock(&mp_bc_lock);
1442
1443   /* set static function pointers */
1444   mp_bc_action_func = action_func;
1445   mp_bc_func_arg = arg;
1446
1447   assert_wait((event_t)(uintptr_t)&mp_bc_count, THREAD_UNINT);
1448
1449   /*
1450    * signal other processors, which will call mp_broadcast_action()
1451    */
1452   simple_lock(&x86_topo_lock);
1453   mp_bc_ncpus = i386_active_cpus();   /* total including this cpu */
1454   mp_bc_count = mp_bc_ncpus;
1455   i386_signal_cpus(MP_BROADCAST, ASYNC);
1456
1457   /* call executor function on this cpu */
1458   mp_broadcast_action();
1459   simple_unlock(&x86_topo_lock);
1460
1461   /* block for all cpus to have run action_func */
1462   if (mp_bc_ncpus > 1)
1463       thread_block(THREAD_CONTINUE_NULL);
1464   else
1465       clear_wait(current_thread(), THREAD_AWAKENED);
1466
1467   /* release lock */
1468   lck_mtx_unlock(&mp_bc_lock);
1469}
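
/*
 * Illustrative usage (editor's sketch, not part of the original source):
 * a hypothetical caller that wants a TSC sample from every active cpu
 * before proceeding could use mp_broadcast(), which blocks until the
 * action has run everywhere. The sample array and function names are
 * invented for the example.
 *
 *	static void
 *	example_sample_tsc(void *arg)
 *	{
 *		uint64_t *samples = (uint64_t *) arg;
 *
 *		samples[cpu_number()] = rdtsc64();
 *	}
 *
 *	void
 *	example_sample_all_tscs(void)
 *	{
 *		static uint64_t samples[MAX_CPUS];
 *
 *		// Blocks until every active cpu has stored its sample.
 *		mp_broadcast(example_sample_tsc, samples);
 *	}
 */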
1470
1471void
1472mp_cpus_kick(cpumask_t cpus)
1473{
1474	cpu_t		cpu;
1475	boolean_t	intrs_enabled = FALSE;
1476
1477	intrs_enabled = ml_set_interrupts_enabled(FALSE);
1478	mp_safe_spin_lock(&x86_topo_lock);
1479
1480	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
1481		if ((cpu == (cpu_t) cpu_number())
1482			|| ((cpu_to_cpumask(cpu) & cpus) == 0)
1483			|| (!cpu_datap(cpu)->cpu_running))
1484		{
1485				continue;
1486		}
1487
1488		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
1489	}
1490
1491	simple_unlock(&x86_topo_lock);
1492	ml_set_interrupts_enabled(intrs_enabled);
1493}
1494
1495void
1496i386_activate_cpu(void)
1497{
1498	cpu_data_t	*cdp = current_cpu_datap();
1499
1500	assert(!ml_get_interrupts_enabled());
1501
1502	if (!smp_initialized) {
1503		cdp->cpu_running = TRUE;
1504		return;
1505	}
1506
1507	simple_lock(&x86_topo_lock);
1508	cdp->cpu_running = TRUE;
1509	started_cpu();
1510	simple_unlock(&x86_topo_lock);
1511	flush_tlb_raw();
1512}
1513
1514void
1515i386_deactivate_cpu(void)
1516{
1517	cpu_data_t	*cdp = current_cpu_datap();
1518
1519	assert(!ml_get_interrupts_enabled());
1520
1521	KERNEL_DEBUG_CONSTANT(
1522		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_START,
1523		0, 0, 0, 0, 0);
1524
1525	simple_lock(&x86_topo_lock);
1526	cdp->cpu_running = FALSE;
1527	simple_unlock(&x86_topo_lock);
1528
1529	/*
1530	 * Move all of this cpu's timers to the master/boot cpu,
1531	 * and poke it in case there's a sooner deadline for it to schedule.
1532	 */
1533	timer_queue_shutdown(&cdp->rtclock_timer.queue);
1534	mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, timer_queue_expire_local, NULL);
1535
1536	/*
1537	 * Open an interrupt window
1538	 * and ensure any pending IPI or timer is serviced
1539	 */
1540	mp_disable_preemption();
1541	ml_set_interrupts_enabled(TRUE);
1542
1543	while (cdp->cpu_signals && x86_lcpu()->rtcDeadline != EndOfAllTime)
1544		cpu_pause();
1545	/*
1546	 * Ensure there's no remaining timer deadline set
1547	 * - AICPM may have left one active.
1548	 */
1549	setPop(0);
1550
1551	ml_set_interrupts_enabled(FALSE);
1552	mp_enable_preemption();
1553
1554	KERNEL_DEBUG_CONSTANT(
1555		TRACE_MP_CPU_DEACTIVATE | DBG_FUNC_END,
1556		0, 0, 0, 0, 0);
1557}
1558
1559int	pmsafe_debug	= 1;
1560
1561#if	MACH_KDP
1562volatile boolean_t	mp_kdp_trap = FALSE;
1563volatile unsigned long	mp_kdp_ncpus;
1564boolean_t		mp_kdp_state;
1565
1566
1567void
1568mp_kdp_enter(void)
1569{
1570	unsigned int	cpu;
1571	unsigned int	ncpus = 0;
1572	unsigned int	my_cpu;
1573	uint64_t	tsc_timeout;
1574
1575	DBG("mp_kdp_enter()\n");
1576
1577#if DEBUG
1578	if (!smp_initialized)
1579		simple_lock_init(&mp_kdp_lock, 0);
1580#endif
1581
1582	/*
1583	 * Here to enter the debugger.
1584	 * In case of races, only one cpu is allowed to enter kdp after
1585	 * stopping others.
1586	 */
1587	mp_kdp_state = ml_set_interrupts_enabled(FALSE);
1588	my_cpu = cpu_number();
1589
1590	if (my_cpu == (unsigned) debugger_cpu) {
1591		kprintf("\n\nRECURSIVE DEBUGGER ENTRY DETECTED\n\n");
1592		kdp_reset();
1593		return;
1594	}
1595
1596	cpu_datap(my_cpu)->debugger_entry_time = mach_absolute_time();
1597	simple_lock(&mp_kdp_lock);
1598
1599	if (pmsafe_debug && !kdp_snapshot)
1600	    pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_SAFE);
1601
1602	while (mp_kdp_trap) {
1603		simple_unlock(&mp_kdp_lock);
1604		DBG("mp_kdp_enter() race lost\n");
1605#if MACH_KDP
1606		mp_kdp_wait(TRUE, FALSE);
1607#endif
1608		simple_lock(&mp_kdp_lock);
1609	}
1610	debugger_cpu = my_cpu;
1611	ncpus = 1;
1612	mp_kdp_ncpus = 1;	/* self */
1613	mp_kdp_trap = TRUE;
1614	debugger_entry_time = cpu_datap(my_cpu)->debugger_entry_time;
1615	simple_unlock(&mp_kdp_lock);
1616
1617	/*
1618	 * Deliver a nudge to other cpus, counting how many
1619	 */
1620	DBG("mp_kdp_enter() signaling other processors\n");
1621	if (force_immediate_debugger_NMI == FALSE) {
1622		for (cpu = 0; cpu < real_ncpus; cpu++) {
1623			if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1624				continue;
1625			ncpus++;
1626			i386_signal_cpu(cpu, MP_KDP, ASYNC);
1627		}
		/*
		 * Wait for other processors to synchronize.
		 */
1631		DBG("mp_kdp_enter() waiting for (%d) processors to suspend\n", ncpus);
1632
1633		/*
1634		 * This timeout is rather arbitrary; we don't want to NMI
1635		 * processors that are executing at potentially
1636		 * "unsafe-to-interrupt" points such as the trampolines,
1637		 * but neither do we want to lose state by waiting too long.
1638		 */
1639		tsc_timeout = rdtsc64() + (ncpus * 1000 * 1000 * 10ULL);
1640
1641		if (virtualized)
1642			tsc_timeout = ~0ULL;
1643
1644		while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
1645			/*
1646			 * A TLB shootdown request may be pending--this would
1647			 * result in the requesting processor waiting in
1648			 * PMAP_UPDATE_TLBS() until this processor deals with it.
1649			 * Process it, so it can now enter mp_kdp_wait()
1650			 */
1651			handle_pending_TLB_flushes();
1652			cpu_pause();
1653		}
1654		/* If we've timed out, and some processor(s) are still unresponsive,
1655		 * interrupt them with an NMI via the local APIC.
1656		 */
1657		if (mp_kdp_ncpus != ncpus) {
1658			for (cpu = 0; cpu < real_ncpus; cpu++) {
1659				if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1660					continue;
1661				if (cpu_signal_pending(cpu, MP_KDP))
1662					cpu_NMI_interrupt(cpu);
1663			}
1664		}
1665	}
1666	else
1667		for (cpu = 0; cpu < real_ncpus; cpu++) {
1668			if (cpu == my_cpu || !cpu_datap(cpu)->cpu_running)
1669				continue;
1670			cpu_NMI_interrupt(cpu);
1671		}
1672
1673	DBG("mp_kdp_enter() %d processors done %s\n",
1674	    (int)mp_kdp_ncpus, (mp_kdp_ncpus == ncpus) ? "OK" : "timed out");
1675
1676	postcode(MP_KDP_ENTER);
1677}
1678
1679static boolean_t
1680cpu_signal_pending(int cpu, mp_event_t event)
1681{
1682	volatile int	*signals = &cpu_datap(cpu)->cpu_signals;
1683	boolean_t retval = FALSE;
1684
1685	if (i_bit(event, signals))
1686		retval = TRUE;
1687	return retval;
1688}
1689
1690long kdp_x86_xcpu_invoke(const uint16_t lcpu, kdp_x86_xcpu_func_t func,
1691			 void *arg0, void *arg1)
1692{
1693	if (lcpu > (real_ncpus - 1))
1694		return -1;
1695
1696        if (func == NULL)
1697		return -1;
1698
1699	kdp_xcpu_call_func.func = func;
1700        kdp_xcpu_call_func.ret  = -1;
1701	kdp_xcpu_call_func.arg0 = arg0;
1702	kdp_xcpu_call_func.arg1 = arg1;
1703	kdp_xcpu_call_func.cpu  = lcpu;
1704	DBG("Invoking function %p on CPU %d\n", func, (int32_t)lcpu);
1705	while (kdp_xcpu_call_func.cpu != KDP_XCPU_NONE)
1706		cpu_pause();
1707        return kdp_xcpu_call_func.ret;
1708}
1709
1710static void
1711kdp_x86_xcpu_poll(void)
1712{
1713	if ((uint16_t)cpu_number() == kdp_xcpu_call_func.cpu) {
1714            kdp_xcpu_call_func.ret =
1715		    kdp_xcpu_call_func.func(kdp_xcpu_call_func.arg0,
1716					    kdp_xcpu_call_func.arg1,
1717					    cpu_number());
1718		kdp_xcpu_call_func.cpu = KDP_XCPU_NONE;
1719	}
1720}
1721
1722static void
1723mp_kdp_wait(boolean_t flush, boolean_t isNMI)
1724{
1725	DBG("mp_kdp_wait()\n");
1726	/* If an I/O port has been specified as a debugging aid, issue a read */
1727	panic_io_port_read();
1728	current_cpu_datap()->debugger_ipi_time = mach_absolute_time();
1729#if CONFIG_MCA
1730	/* If we've trapped due to a machine-check, save MCA registers */
1731	mca_check_save();
1732#endif
1733
1734	atomic_incl((volatile long *)&mp_kdp_ncpus, 1);
1735	while (mp_kdp_trap || (isNMI == TRUE)) {
1736	        /*
1737		 * A TLB shootdown request may be pending--this would result
1738		 * in the requesting processor waiting in PMAP_UPDATE_TLBS()
1739		 * until this processor handles it.
1740		 * Process it, so it can now enter mp_kdp_wait()
1741		 */
1742		if (flush)
1743			handle_pending_TLB_flushes();
1744
1745		kdp_x86_xcpu_poll();
1746		cpu_pause();
1747	}
1748
1749	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1750	DBG("mp_kdp_wait() done\n");
1751}
1752
1753void
1754mp_kdp_exit(void)
1755{
1756	DBG("mp_kdp_exit()\n");
1757	debugger_cpu = -1;
1758	atomic_decl((volatile long *)&mp_kdp_ncpus, 1);
1759
1760	debugger_exit_time = mach_absolute_time();
1761
1762	mp_kdp_trap = FALSE;
1763	mfence();
1764
	/* Wait for other processors to stop spinning. XXX needs timeout */
1766	DBG("mp_kdp_exit() waiting for processors to resume\n");
1767	while (mp_kdp_ncpus > 0) {
1768	        /*
1769		 * a TLB shootdown request may be pending... this would result in the requesting
1770		 * processor waiting in PMAP_UPDATE_TLBS() until this processor deals with it.
		 * Process it, so the requesting processor can proceed.
1772		 */
1773	        handle_pending_TLB_flushes();
1774
1775		cpu_pause();
1776	}
1777
1778	if (pmsafe_debug && !kdp_snapshot)
1779	    pmSafeMode(&current_cpu_datap()->lcpu, PM_SAFE_FL_NORMAL);
1780
1781	debugger_exit_time = mach_absolute_time();
1782
1783	DBG("mp_kdp_exit() done\n");
1784	(void) ml_set_interrupts_enabled(mp_kdp_state);
1785	postcode(0);
1786}
1787#endif	/* MACH_KDP */
1788
1789boolean_t
1790mp_recent_debugger_activity() {
1791	uint64_t abstime = mach_absolute_time();
1792	return (((abstime - debugger_entry_time) < LastDebuggerEntryAllowance) ||
1793	    ((abstime - debugger_exit_time) < LastDebuggerEntryAllowance));
1794}
1795
1796/*ARGSUSED*/
1797void
1798init_ast_check(
1799	__unused processor_t	processor)
1800{
1801}
1802
1803void
1804cause_ast_check(
1805	processor_t	processor)
1806{
1807	int	cpu = processor->cpu_id;
1808
1809	if (cpu != cpu_number()) {
1810		i386_signal_cpu(cpu, MP_AST, ASYNC);
1811		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_AST), cpu, 1, 0, 0, 0);
1812	}
1813}
1814
1815void
1816slave_machine_init(void *param)
1817{
1818	/*
1819 	 * Here in process context, but with interrupts disabled.
1820	 */
1821	DBG("slave_machine_init() CPU%d\n", get_cpu_number());
1822
1823	if (param == FULL_SLAVE_INIT) {
1824		/*
1825		 * Cold start
1826		 */
1827		clock_init();
1828	}
1829	cpu_machine_init();	/* Interrupts enabled hereafter */
1830}
1831
1832#undef cpu_number
1833int cpu_number(void)
1834{
1835	return get_cpu_number();
1836}
1837
1838static void
1839cpu_prewarm_init()
1840{
1841	int i;
1842
1843	simple_lock_init(&cpu_warm_lock, 0);
1844	queue_init(&cpu_warm_call_list);
1845	for (i = 0; i < NUM_CPU_WARM_CALLS; i++) {
1846		enqueue_head(&cpu_warm_call_list, (queue_entry_t)&cpu_warm_call_arr[i]);
1847	}
1848}
1849
1850static timer_call_t
1851grab_warm_timer_call()
1852{
1853	spl_t x;
1854	timer_call_t call = NULL;
1855
1856	x = splsched();
1857	simple_lock(&cpu_warm_lock);
1858	if (!queue_empty(&cpu_warm_call_list)) {
1859		call = (timer_call_t) dequeue_head(&cpu_warm_call_list);
1860	}
1861	simple_unlock(&cpu_warm_lock);
1862	splx(x);
1863
1864	return call;
1865}
1866
1867static void
1868free_warm_timer_call(timer_call_t call)
1869{
1870	spl_t x;
1871
1872	x = splsched();
1873	simple_lock(&cpu_warm_lock);
1874	enqueue_head(&cpu_warm_call_list, (queue_entry_t)call);
1875	simple_unlock(&cpu_warm_lock);
1876	splx(x);
1877}
1878
1879/*
1880 * Runs in timer call context (interrupts disabled).
1881 */
1882static void
1883cpu_warm_timer_call_func(
1884		call_entry_param_t p0,
1885		__unused call_entry_param_t p1)
1886{
1887	free_warm_timer_call((timer_call_t)p0);
1888	return;
1889}
1890
1891/*
1892 * Runs with interrupts disabled on the CPU we wish to warm (i.e. CPU 0).
1893 */
1894static void
1895_cpu_warm_setup(
1896		void *arg)
1897{
1898	cpu_warm_data_t cwdp = (cpu_warm_data_t)arg;
1899
1900	timer_call_enter(cwdp->cwd_call, cwdp->cwd_deadline, TIMER_CALL_SYS_CRITICAL | TIMER_CALL_LOCAL);
1901	cwdp->cwd_result = 0;
1902
1903	return;
1904}
1905
1906/*
1907 * Not safe to call with interrupts disabled.
1908 */
1909kern_return_t
1910ml_interrupt_prewarm(
1911	uint64_t 	deadline)
1912{
1913	struct cpu_warm_data cwd;
1914	timer_call_t call;
1915	cpu_t ct;
1916
1917	if (ml_get_interrupts_enabled() == FALSE) {
1918		panic("%s: Interrupts disabled?\n", __FUNCTION__);
1919	}
1920
1921	/*
1922	 * If the platform doesn't need our help, say that we succeeded.
1923	 */
1924	if (!ml_get_interrupt_prewake_applicable()) {
1925		return KERN_SUCCESS;
1926	}
1927
1928	/*
1929	 * Grab a timer call to use.
1930	 */
1931	call = grab_warm_timer_call();
1932	if (call == NULL) {
1933		return KERN_RESOURCE_SHORTAGE;
1934	}
1935
1936	timer_call_setup(call, cpu_warm_timer_call_func, call);
1937	cwd.cwd_call = call;
1938	cwd.cwd_deadline = deadline;
1939	cwd.cwd_result = 0;
1940
1941	/*
1942	 * For now, non-local interrupts happen on the master processor.
1943	 */
1944	ct = mp_cpus_call(cpu_to_cpumask(master_cpu), SYNC, _cpu_warm_setup, &cwd);
1945	if (ct == 0) {
1946		free_warm_timer_call(call);
1947		return KERN_FAILURE;
1948	} else {
1949		return cwd.cwd_result;
1950	}
1951}
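
/*
 * Illustrative usage (editor's sketch, not part of the original source):
 * a driver expecting an interrupt at a known future time might prewarm the
 * master cpu so it is out of a deep idle state when the interrupt arrives.
 * The one-millisecond lead time is an arbitrary example value;
 * nanoseconds_to_absolutetime() and NSEC_PER_MSEC are assumed from the
 * clock headers.
 *
 *	uint64_t	abstime;
 *
 *	nanoseconds_to_absolutetime(NSEC_PER_MSEC, &abstime);
 *	(void) ml_interrupt_prewarm(mach_absolute_time() + abstime);
 */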
1952