1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 2009-2010, Intel Corporation.
27 * All rights reserved.
28 */
29
30#include <sys/x86_archext.h>
31#include <sys/machsystm.h>
32#include <sys/x_call.h>
33#include <sys/stat.h>
34#include <sys/acpi/acpi.h>
35#include <sys/acpica.h>
36#include <sys/cpu_acpi.h>
37#include <sys/cpu_idle.h>
38#include <sys/cpupm.h>
39#include <sys/cpu_event.h>
40#include <sys/hpet.h>
41#include <sys/archsystm.h>
42#include <vm/hat_i86.h>
43#include <sys/dtrace.h>
44#include <sys/sdt.h>
45#include <sys/callb.h>
46
/*
 * Selectors passed as the "timer" argument of cstate_use_timer():
 * CSTATE_USING_HPET asks for the HPET to be used, CSTATE_USING_LAT asks for
 * the switch back to the local APIC timer.
 */
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

/*
 * cpu_idle_stop() logs a note each time its wait-loop iteration count
 * reaches a multiple of this value.
 */
#define	CPU_IDLE_STOP_TIMEOUT		1000

/* External routines used by this file. */
extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

/* Forward declarations of file-local routines. */
static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
64
/*
 * Which timer arrangement is relied upon while in deep C-states.  Both flags
 * start out B_FALSE and are set by cpu_deep_cstates_supported():
 *
 * cpu_cstate_arat - cpuid_arat_supported() reported an always-running
 *		     local APIC timer.
 * cpu_cstate_hpet - the HPET is fully supported and its proxy was installed,
 *		     so the HPET is used in deep C-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
71
/*
 * Interfaces for modules implementing Intel's deep c-state.
 *
 * The initializer is positional: a descriptive label, the init routine, the
 * fini routine, one slot left NULL, and the stop routine.
 * NOTE(review): the NULL slot is presumably the state-change hook of
 * cpupm_state_ops_t -- confirm against that structure's declaration.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};
82
/*
 * cpu_idle_callb_mutex is held while the callback ids below are registered
 * or deleted, inside both callbacks, and by cpu_idle_stop() while it waits
 * for the target CPU to leave the deep idle routine.
 *
 * cpu_deep_idle_callb_id and cpu_idle_cpr_callb_id hold the ids returned by
 * callb_add(); zero means "not registered".
 *
 * cpu_idle_cfg_state carries the CPU_IDLE_DEEP_CFG bit, which
 * cpu_deep_idle_callb() sets when deep idle is disabled and clears when it
 * is enabled again.
 */
static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
static uint_t		cpu_idle_cfg_state;

/* Installed as ks_lock of every c-state kstat created by cpu_idle_init(). */
static kmutex_t cpu_idle_mutex;

/*
 * Named kstat data shared by all c-state kstats; cpu_idle_kstat_update()
 * refreshes it from the C-state a given kstat refers to.
 */
cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};
95
96/*
97 * kstat update function of the c-state info
98 */
99static int
100cpu_idle_kstat_update(kstat_t *ksp, int flag)
101{
102	cpu_acpi_cstate_t *cstate = ksp->ks_private;
103
104	if (flag == KSTAT_WRITE) {
105		return (EACCES);
106	}
107
108	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
109		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
110		"FFixedHW");
111	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
112		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
113		"SystemIO");
114	} else {
115		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
116		"Unsupported");
117	}
118
119	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
120	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
121
122	return (0);
123}
124
125/*
126 * Used during configuration callbacks to manage implementation specific
127 * details of the hardware timer used during Deep C-state.
128 */
129boolean_t
130cstate_timer_callback(int code)
131{
132	if (cpu_cstate_arat) {
133		return (B_TRUE);
134	} else if (cpu_cstate_hpet) {
135		return (hpet.callback(code));
136	}
137	return (B_FALSE);
138}
139
140/*
141 * Some Local APIC Timers do not work during Deep C-states.
142 * The Deep C-state idle function uses this function to ensure it is using a
143 * hardware timer that works during Deep C-states.  This function also
144 * switches the timer back to the LACPI Timer after Deep C-state.
145 */
146static boolean_t
147cstate_use_timer(hrtime_t *lapic_expire, int timer)
148{
149	if (cpu_cstate_arat)
150		return (B_TRUE);
151
152	/*
153	 * We have to return B_FALSE if no arat or hpet support
154	 */
155	if (!cpu_cstate_hpet)
156		return (B_FALSE);
157
158	switch (timer) {
159	case CSTATE_USING_HPET:
160		return (hpet.use_hpet_timer(lapic_expire));
161	case CSTATE_USING_LAT:
162		hpet.use_lapic_timer(*lapic_expire);
163		return (B_TRUE);
164	default:
165		return (B_FALSE);
166	}
167}
168
169/*
170 * c-state wakeup function.
171 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
172 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
173 */
174void
175cstate_wakeup(cpu_t *cp, int bound)
176{
177	struct machcpu	*mcpu = &(cp->cpu_m);
178	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
179	cpupart_t	*cpu_part;
180	uint_t		cpu_found;
181	processorid_t	cpu_sid;
182
183	cpu_part = cp->cpu_part;
184	cpu_sid = cp->cpu_seqid;
185	/*
186	 * Clear the halted bit for that CPU since it will be woken up
187	 * in a moment.
188	 */
189	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
190		/*
191		 * Clear the halted bit for that CPU since it will be
192		 * poked in a moment.
193		 */
194		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
195
196		/*
197		 * We may find the current CPU present in the halted cpuset
198		 * if we're in the context of an interrupt that occurred
199		 * before we had a chance to clear our bit in cpu_idle().
200		 * Waking ourself is obviously unnecessary, since if
201		 * we're here, we're not halted.
202		 */
203		if (cp != CPU) {
204			/*
205			 * Use correct wakeup mechanism
206			 */
207			if ((mcpu_mwait != NULL) &&
208			    (*mcpu_mwait == MWAIT_HALTED))
209				MWAIT_WAKEUP(cp);
210			else
211				poke_cpu(cp->cpu_id);
212		}
213		return;
214	} else {
215		/*
216		 * This cpu isn't halted, but it's idle or undergoing a
217		 * context switch. No need to awaken anyone else.
218		 */
219		if (cp->cpu_thread == cp->cpu_idle_thread ||
220		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
221			return;
222	}
223
224	/*
225	 * No need to wake up other CPUs if the thread we just enqueued
226	 * is bound.
227	 */
228	if (bound)
229		return;
230
231
232	/*
233	 * See if there's any other halted CPUs. If there are, then
234	 * select one, and awaken it.
235	 * It's possible that after we find a CPU, somebody else
236	 * will awaken it before we get the chance.
237	 * In that case, look again.
238	 */
239	do {
240		cpu_found = bitset_find(&cpu_part->cp_haltset);
241		if (cpu_found == (uint_t)-1)
242			return;
243
244	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
245	    cpu_found) < 0);
246
247	/*
248	 * Must use correct wakeup mechanism to avoid lost wakeup of
249	 * alternate cpu.
250	 */
251	if (cpu_found != CPU->cpu_seqid) {
252		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
253		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
254			MWAIT_WAKEUP(cpu_seq[cpu_found]);
255		else
256			poke_cpu(cpu_seq[cpu_found]->cpu_id);
257	}
258}
259
260/*
261 * Function called by CPU idle notification framework to check whether CPU
262 * has been awakened. It will be called with interrupt disabled.
263 * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
264 * notification framework.
265 */
266static void
267acpi_cpu_mwait_check_wakeup(void *arg)
268{
269	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
270
271	ASSERT(arg != NULL);
272	if (*mcpu_mwait != MWAIT_HALTED) {
273		/*
274		 * CPU has been awakened, notify CPU idle notification system.
275		 */
276		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
277	} else {
278		/*
279		 * Toggle interrupt flag to detect pending interrupts.
280		 * If interrupt happened, do_interrupt() will notify CPU idle
281		 * notification framework so no need to call cpu_idle_exit()
282		 * here.
283		 */
284		sti();
285		SMT_PAUSE();
286		cli();
287	}
288}
289
290static void
291acpi_cpu_mwait_ipi_check_wakeup(void *arg)
292{
293	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
294
295	ASSERT(arg != NULL);
296	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
297		/*
298		 * CPU has been awakened, notify CPU idle notification system.
299		 */
300		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
301	} else {
302		/*
303		 * Toggle interrupt flag to detect pending interrupts.
304		 * If interrupt happened, do_interrupt() will notify CPU idle
305		 * notification framework so no need to call cpu_idle_exit()
306		 * here.
307		 */
308		sti();
309		SMT_PAUSE();
310		cli();
311	}
312}
313
/*
 * Default wakeup check, used when no monitored word is available.
 *
 * All it can do is open the interrupt window for a moment so that a pending
 * interrupt is taken; do_interrupt() then notifies the CPU idle notification
 * framework, which is why cpu_idle_exit() is never called from here.
 */
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	sti();		/* let pending interrupts in ... */
	SMT_PAUSE();
	cli();		/* ... and close the window again */
}
327
/*
 * enter deep c-state handler
 *
 * Puts the current CPU into the ACPI C-state described by cstate.
 * Depending on cstate->cs_addrspace_id the state is entered either with
 * MONITOR/MWAIT (ACPI_ADR_SPACE_FIXED_HARDWARE) or by reading the I/O port
 * at cstate->cs_address (ACPI_ADR_SPACE_SYSTEM_IO).  If cstate_use_timer()
 * cannot arrange a timer for the deep state, the CPU idles with a plain
 * MWAIT (reported as IDLE_STATE_C1) instead.
 *
 * The routine returns without idling when work shows up, or when this CPU's
 * bit has already been removed from the halted set.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 *
	 * The value written and the matching wakeup check depend on how the
	 * C-state is entered: MWAIT_WAKEUP_IPI for SystemIO, MWAIT_HALTED
	 * otherwise.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the cstate_wakeup() and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear our mc_haltset bit, and then call
	 * cstate_wakeup().
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 *
	 * The bit is already gone on this path, so only the HALTED flag is
	 * cleared; the halted set itself is left alone.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		/* Go back to the LAPIC timer and re-enable interrupts. */
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU with an IPI which
		 * works with MWAIT.
		 *
		 * MWAIT_WAKEUP_IPI is masked out of the comparisons below,
		 * so the C1 fallback is also taken when the word was set
		 * up for the SystemIO method.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * Re-check the word: a wakeup may have
				 * arrived while cpu_idle_enter() ran.
				 */
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					/*
					 * Read the C-state's I/O port, then
					 * the PM timer register taken from
					 * the FADT.  Both values are
					 * discarded.
					 */
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}
548
549/*
550 * Idle the present CPU, deep c-state is supported
551 */
552void
553cpu_acpi_idle(void)
554{
555	cpu_t *cp = CPU;
556	cpu_acpi_handle_t handle;
557	cma_c_state_t *cs_data;
558	cpu_acpi_cstate_t *cstates;
559	hrtime_t start, end;
560	int cpu_max_cstates;
561	uint32_t cs_indx;
562	uint16_t cs_type;
563
564	cpupm_mach_state_t *mach_state =
565	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
566	handle = mach_state->ms_acpi_handle;
567	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
568
569	cs_data = mach_state->ms_cstate.cma_state.cstate;
570	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
571	ASSERT(cstates != NULL);
572	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
573	if (cpu_max_cstates > CPU_MAX_CSTATES)
574		cpu_max_cstates = CPU_MAX_CSTATES;
575	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
576		(*non_deep_idle_cpu)();
577		return;
578	}
579
580	start = gethrtime_unscaled();
581
582	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
583
584	cs_type = cstates[cs_indx].cs_type;
585
586	switch (cs_type) {
587	default:
588		/* FALLTHROUGH */
589	case CPU_ACPI_C1:
590		(*non_deep_idle_cpu)();
591		break;
592
593	case CPU_ACPI_C2:
594		acpi_cpu_cstate(&cstates[cs_indx]);
595		break;
596
597	case CPU_ACPI_C3:
598		/*
599		 * All supported Intel processors maintain cache coherency
600		 * during C3.  Currently when entering C3 processors flush
601		 * core caches to higher level shared cache. The shared cache
602		 * maintains state and supports probes during C3.
603		 * Consequently there is no need to handle cache coherency
604		 * and Bus Master activity here with the cache flush, BM_RLD
605		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
606		 * in section 8.1.4 of the ACPI Specification 4.0.
607		 */
608		acpi_cpu_cstate(&cstates[cs_indx]);
609		break;
610	}
611
612	end = gethrtime_unscaled();
613
614	/*
615	 * Update statistics
616	 */
617	cpupm_wakeup_cstate_data(cs_data, end);
618}
619
620boolean_t
621cpu_deep_cstates_supported(void)
622{
623	extern int	idle_cpu_no_deep_c;
624
625	if (idle_cpu_no_deep_c)
626		return (B_FALSE);
627
628	if (!cpuid_deep_cstates_supported())
629		return (B_FALSE);
630
631	if (cpuid_arat_supported()) {
632		cpu_cstate_arat = B_TRUE;
633		return (B_TRUE);
634	}
635
636	if ((hpet.supported == HPET_FULL_SUPPORT) &&
637	    hpet.install_proxy()) {
638		cpu_cstate_hpet = B_TRUE;
639		return (B_TRUE);
640	}
641
642	return (B_FALSE);
643}
644
645/*
646 * Validate that this processor supports deep cstate and if so,
647 * get the c-state data from ACPI and cache it.
648 */
649static int
650cpu_idle_init(cpu_t *cp)
651{
652	cpupm_mach_state_t *mach_state =
653	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
654	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
655	cpu_acpi_cstate_t *cstate;
656	char name[KSTAT_STRLEN];
657	int cpu_max_cstates, i;
658	int ret;
659
660	/*
661	 * Cache the C-state specific ACPI data.
662	 */
663	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
664		if (ret < 0)
665			cmn_err(CE_NOTE,
666			    "!Support for CPU deep idle states is being "
667			    "disabled due to errors parsing ACPI C-state "
668			    "objects exported by BIOS.");
669		cpu_idle_fini(cp);
670		return (-1);
671	}
672
673	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
674
675	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
676
677	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
678		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
679		/*
680		 * Allocate, initialize and install cstate kstat
681		 */
682		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
683		    name, "misc",
684		    KSTAT_TYPE_NAMED,
685		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
686		    KSTAT_FLAG_VIRTUAL);
687
688		if (cstate->cs_ksp == NULL) {
689			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
690		} else {
691			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
692			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
693			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
694			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
695			cstate->cs_ksp->ks_private = cstate;
696			kstat_install(cstate->cs_ksp);
697		}
698		cstate++;
699	}
700
701	cpupm_alloc_domains(cp, CPUPM_C_STATES);
702	cpupm_alloc_ms_cstate(cp);
703
704	if (cpu_deep_cstates_supported()) {
705		uint32_t value;
706
707		mutex_enter(&cpu_idle_callb_mutex);
708		if (cpu_deep_idle_callb_id == (callb_id_t)0)
709			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
710			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
711		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
712			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
713			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
714		mutex_exit(&cpu_idle_callb_mutex);
715
716
717		/*
718		 * All supported CPUs (Nehalem and later) will remain in C3
719		 * during Bus Master activity.
720		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
721		 * is not already 0 before enabling Deeper C-states.
722		 */
723		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
724		if (value & 1)
725			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
726	}
727
728	return (0);
729}
730
731/*
732 * Free resources allocated by cpu_idle_init().
733 */
734static void
735cpu_idle_fini(cpu_t *cp)
736{
737	cpupm_mach_state_t *mach_state =
738	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
739	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
740	cpu_acpi_cstate_t *cstate;
741	uint_t	cpu_max_cstates, i;
742
743	/*
744	 * idle cpu points back to the generic one
745	 */
746	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
747	disp_enq_thread = non_deep_idle_disp_enq_thread;
748
749	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
750	if (cstate) {
751		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
752
753		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
754			if (cstate->cs_ksp != NULL)
755				kstat_delete(cstate->cs_ksp);
756			cstate++;
757		}
758	}
759
760	cpupm_free_ms_cstate(cp);
761	cpupm_free_domains(&cpupm_cstate_domains);
762	cpu_acpi_free_cstate_data(handle);
763
764	mutex_enter(&cpu_idle_callb_mutex);
765	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
766		(void) callb_delete(cpu_deep_idle_callb_id);
767		cpu_deep_idle_callb_id = (callb_id_t)0;
768	}
769	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
770		(void) callb_delete(cpu_idle_cpr_callb_id);
771		cpu_idle_cpr_callb_id = (callb_id_t)0;
772	}
773	mutex_exit(&cpu_idle_callb_mutex);
774}
775
776/*
777 * This function is introduced here to solve a race condition
778 * between the master and the slave to touch c-state data structure.
779 * After the slave calls this idle function to switch to the non
780 * deep idle function, the master can go on to reclaim the resource.
781 */
782static void
783cpu_idle_stop_sync(void)
784{
785	/* switch to the non deep idle function */
786	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
787}
788
789static void
790cpu_idle_stop(cpu_t *cp)
791{
792	cpupm_mach_state_t *mach_state =
793	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
794	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
795	cpu_acpi_cstate_t *cstate;
796	uint_t cpu_max_cstates, i = 0;
797
798	mutex_enter(&cpu_idle_callb_mutex);
799	if (idle_cpu == cpu_idle_adaptive) {
800		/*
801		 * invoke the slave to call synchronous idle function.
802		 */
803		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
804		poke_cpu(cp->cpu_id);
805
806		/*
807		 * wait until the slave switchs to non deep idle function,
808		 * so that the master is safe to go on to reclaim the resource.
809		 */
810		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
811			drv_usecwait(10);
812			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
813				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
814				    " idle stop timeout");
815		}
816	}
817	mutex_exit(&cpu_idle_callb_mutex);
818
819	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
820	if (cstate) {
821		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
822
823		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
824			if (cstate->cs_ksp != NULL)
825				kstat_delete(cstate->cs_ksp);
826			cstate++;
827		}
828	}
829	cpupm_free_ms_cstate(cp);
830	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
831	cpu_acpi_free_cstate_data(handle);
832}
833
834/*ARGSUSED*/
835static boolean_t
836cpu_deep_idle_callb(void *arg, int code)
837{
838	boolean_t rslt = B_TRUE;
839
840	mutex_enter(&cpu_idle_callb_mutex);
841	switch (code) {
842	case PM_DEFAULT_CPU_DEEP_IDLE:
843		/*
844		 * Default policy is same as enable
845		 */
846		/*FALLTHROUGH*/
847	case PM_ENABLE_CPU_DEEP_IDLE:
848		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
849			break;
850
851		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
852			disp_enq_thread = cstate_wakeup;
853			idle_cpu = cpu_idle_adaptive;
854			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
855		} else {
856			rslt = B_FALSE;
857		}
858		break;
859
860	case PM_DISABLE_CPU_DEEP_IDLE:
861		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
862			break;
863
864		idle_cpu = non_deep_idle_cpu;
865		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
866			disp_enq_thread = non_deep_idle_disp_enq_thread;
867			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
868		}
869		break;
870
871	default:
872		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
873		    code);
874		break;
875	}
876	mutex_exit(&cpu_idle_callb_mutex);
877	return (rslt);
878}
879
880/*ARGSUSED*/
881static boolean_t
882cpu_idle_cpr_callb(void *arg, int code)
883{
884	boolean_t rslt = B_TRUE;
885
886	mutex_enter(&cpu_idle_callb_mutex);
887	switch (code) {
888	case CB_CODE_CPR_RESUME:
889		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
890			/*
891			 * Do not enable dispatcher hooks if disabled by user.
892			 */
893			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
894				break;
895
896			disp_enq_thread = cstate_wakeup;
897			idle_cpu = cpu_idle_adaptive;
898		} else {
899			rslt = B_FALSE;
900		}
901		break;
902
903	case CB_CODE_CPR_CHKPT:
904		idle_cpu = non_deep_idle_cpu;
905		disp_enq_thread = non_deep_idle_disp_enq_thread;
906		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
907		break;
908
909	default:
910		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
911		break;
912	}
913	mutex_exit(&cpu_idle_callb_mutex);
914	return (rslt);
915}
916
/*
 * handle _CST notification
 *
 * Re-evaluates the cached ACPI C-state data for CPUs of the C-state domain
 * that cp belongs to, and selects the idle routine and dispatcher hook
 * according to the number of C-states each CPU then reports.
 *
 * Does nothing if cp has no power management state.  The whole body is
 * compiled out when __xpv is defined.
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef	__xpv
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t	handle;
	struct machcpu		*mcpu;
	cpuset_t 		dom_cpu_set;
	kmutex_t		*pm_lock;
	int			result = 0;
	processorid_t		cpu_id;

	if (mach_state == NULL) {
		return;
	}

	/*
	 * Work on a local copy of the domain's CPU set; CPUs are removed
	 * from the copy as they are handled.
	 */
	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

	/*
	 * Do for all the CPU's in the domain
	 *
	 * NOTE(review): the loop below repeats only while
	 * CPUSET_ATOMIC_XDEL() leaves result negative.  Confirm the macro's
	 * result convention: if a successful delete yields a non-negative
	 * result, only the first CPU found in the set is processed.
	 */
	mutex_enter(pm_lock);
	do {
		CPUSET_FIND(dom_cpu_set, cpu_id);
		if (cpu_id == CPUSET_NOTINSET)
			break;

		/* From here on cp and mach_state refer to the CPU found. */
		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
		cp = cpu[cpu_id];
		mach_state = (cpupm_mach_state_t *)
		    cp->cpu_m.mcpu_pm_mach_state;
		/* A CPU without C-state capability ends the whole walk. */
		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
			mutex_exit(pm_lock);
			return;
		}
		handle = mach_state->ms_acpi_handle;
		ASSERT(handle != NULL);

		/*
		 * re-evaluate cstate object
		 *
		 * A failure is only warned about; processing continues.
		 */
		if (cpu_acpi_cache_cstate_data(handle) != 0) {
			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
			    " object Instance: %d", cpu_id);
		}
		mcpu = &(cp->cpu_m);
		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
		if (mcpu->max_cstates > CPU_ACPI_C1) {
			/* More than C1: use the ACPI idle routine. */
			(void) cstate_timer_callback(
			    CST_EVENT_MULTIPLE_CSTATES);
			disp_enq_thread = cstate_wakeup;
			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
			/* C1 only: fall back to the generic idle routine. */
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
		}

		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
	} while (result < 0);
	mutex_exit(pm_lock);
#endif
}
986
987/*
988 * handle the number or the type of available processor power states change
989 */
990void
991cpuidle_manage_cstates(void *ctx)
992{
993	cpu_t			*cp = ctx;
994	cpupm_mach_state_t	*mach_state =
995	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
996	boolean_t		is_ready;
997
998	if (mach_state == NULL) {
999		return;
1000	}
1001
1002	/*
1003	 * We currently refuse to power manage if the CPU is not ready to
1004	 * take cross calls (cross calls fail silently if CPU is not ready
1005	 * for it).
1006	 *
1007	 * Additionally, for x86 platforms we cannot power manage an instance,
1008	 * until it has been initialized.
1009	 */
1010	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
1011	if (!is_ready)
1012		return;
1013
1014	cpuidle_cstate_instance(cp);
1015}
1016