/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cpu_pm.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>

/*
 * Solaris Event Based CPU Power Manager
 *
 * This file implements platform independent event based CPU power management.
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power savings states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to an active one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * that the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request the CPUs in the domain run at their fastest (and most
 * power consuming) state. When the domain becomes idle (utilization at zero),
 * the power manager will request that the CPUs run at a speed that saves the
 * most power.
 *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance in the presence of utilization, and for power
 * savings in the presence of idleness. Such close collaboration with the
 * dispatcher has other benefits that will play out in the form of more
 * sophisticated power / performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of utilization
 * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements several "governors" that are used to throttle
 * state transitions when a significant amount of transient idle or transient
 * work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Left ungoverned in the face of this
 * utilization, hundreds of state transitions per second would result on an
 * idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system. Where the idle period is short
 * enough, the overhead associated with making the state transition doesn't
 * justify the power savings.
 */

static cpupm_domain_t *cpupm_domains = NULL;

/*
 * Uninitialized state of CPU power management is disabled
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are
 * characterized as transient. State changes associated with transient work
 * are considered to be mispredicted. That is, it's not worth raising and
 * lowering power states where the utilization lasts for less than this
 * interval.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 2;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 10;
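
/*
 * Illustrative example of the thresholds at their defaults: two
 * successive idle periods, each shorter than cpupm_ti_predict_interval,
 * engage the transient idle governor; once engaged, ten mispredicted
 * (i.e. non-transient) governed transitions are required before the
 * governor is removed again.
 */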

/*
 * The transient work and transient idle prediction intervals are initialized
 * to be some multiple of the amount of time it takes to transition a power
 * domain from the highest to the lowest power state, and back again, which
 * is measured.
 *
 * The default values of those multiples are specified here. Tuning them higher
 * will result in the transient work, and transient idle governors being used
 * more aggressively, which limits the frequency of state transitions at the
 * expense of performance and power savings, respectively.
 */
#define	CPUPM_TI_GOV_DEFAULT_MULTIPLE 600
#define	CPUPM_TW_GOV_DEFAULT_MULTIPLE 25

/*
 * Number of high=>low=>high measurements performed, of which the average
 * is taken.
 */
#define	CPUPM_BENCHMARK_ITERS 5

int cpupm_ti_gov_multiple = CPUPM_TI_GOV_DEFAULT_MULTIPLE;
int cpupm_tw_gov_multiple = CPUPM_TW_GOV_DEFAULT_MULTIPLE;
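
/*
 * Illustrative arithmetic (assumed measurement, for exposition only):
 * if the benchmarked high=>low=>high transition latency averages the
 * equivalent of 100us, the defaults above yield a transient idle
 * prediction interval of 60ms (600 * 100us) and a transient work
 * prediction interval of 2.5ms (25 * 100us), both in the same unscaled
 * hrtime units as the measurement.
 */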


static int	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);

cpupm_policy_t
cpupm_get_policy(void)
{
	return (cpupm_policy);
}

int
cpupm_set_policy(cpupm_policy_t new_policy)
{
	static int	gov_init = 0;
	int		result = 0;

	mutex_enter(&cpu_lock);
	if (new_policy == cpupm_policy) {
		mutex_exit(&cpu_lock);
		return (result);
	}

	/*
	 * Pausing CPUs causes a high priority thread to be scheduled
	 * on all other CPUs (besides the current one). This locks out
	 * other CPUs from making CPUPM state transitions.
	 */
	switch (new_policy) {
	case CPUPM_POLICY_DISABLED:
		pause_cpus(NULL);
		cpupm_policy = CPUPM_POLICY_DISABLED;
		start_cpus();

		result = cmt_pad_disable(PGHW_POW_ACTIVE);

		/*
		 * Once PAD has been enabled, it should always be possible
		 * to disable it.
		 */
		ASSERT(result == 0);

		/*
		 * Bring all the active power domains to the maximum
		 * performance state.
		 */
		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
		    CPUPM_STATE_MAX_PERF);

		break;
	case CPUPM_POLICY_ELASTIC:

		result = cmt_pad_enable(PGHW_POW_ACTIVE);
		if (result < 0) {
			/*
			 * Failed to enable PAD across the active power
			 * domains, which may well be because none were
			 * enumerated.
			 */
			break;
		}

		pause_cpus(NULL);
		/*
		 * Attempt to initialize the governor parameters the first
		 * time through.
		 */
		if (gov_init == 0) {
			result = cpupm_governor_initialize();
			if (result == 0) {
				gov_init = 1;
			} else {
				/*
				 * Failed to initialize the governor parameters
				 */
				start_cpus();
				break;
			}
		}
		cpupm_policy = CPUPM_POLICY_ELASTIC;
		start_cpus();

		break;
	default:
		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
		    new_policy);
		ASSERT(0);
		break;
	}
	mutex_exit(&cpu_lock);

	return (result);
}
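
/*
 * Illustrative usage (a sketch; this policy interface isn't invoked
 * from this file): a platform or administrative path enabling the
 * elastic policy might do the following, leaving the policy disabled
 * if no active power domains could be enumerated:
 *
 *	if (cpupm_set_policy(CPUPM_POLICY_ELASTIC) < 0)
 *		(policy remains CPUPM_POLICY_DISABLED)
 */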

/*
 * Look for an existing power domain
 */
static cpupm_domain_t *
cpupm_domain_find(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t *dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = cpupm_domains;
	while (dom != NULL) {
		if (id == dom->cpd_id && type == dom->cpd_type)
			return (dom);
		dom = dom->cpd_next;
	}
	return (NULL);
}

/*
 * Create a new domain
 */
static cpupm_domain_t *
cpupm_domain_create(id_t id, cpupm_dtype_t type)
{
	cpupm_domain_t *dom;

	ASSERT(MUTEX_HELD(&cpu_lock));

	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
	dom->cpd_id = id;
	dom->cpd_type = type;

	/* Link into the known domain list */
	dom->cpd_next = cpupm_domains;
	cpupm_domains = dom;

	return (dom);
}

static void
cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
{
	/*
	 * In the event we're enumerating because the domain's state
	 * configuration has changed, toss any existing states.
	 */
	if (dom->cpd_nstates > 0) {
		kmem_free(dom->cpd_states,
		    sizeof (cpupm_state_t) * dom->cpd_nstates);
		dom->cpd_nstates = 0;
	}

	/*
	 * Query to determine the number of states, allocate storage
	 * large enough to hold the state information, and pass it back
	 * to the platform driver to complete the enumeration.
	 */
	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);

	if (dom->cpd_nstates == 0)
		return;

	dom->cpd_states =
	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
}

/*
 * Initialize the specified type of power domain on behalf of the CPU
 */
cpupm_domain_t *
cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
{
	cpupm_domain_t	*dom;
	id_t		did;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Instantiate the domain if it doesn't already exist
	 * and enumerate its power states.
	 */
	did = cpupm_domain_id(cp, type);
	dom = cpupm_domain_find(did, type);
	if (dom == NULL) {
		dom = cpupm_domain_create(did, type);
		cpupm_domain_state_enum(cp, dom);
	}

	/*
	 * Named state initialization
	 */
	if (type == CPUPM_DTYPE_ACTIVE) {
		/*
		 * For active power domains, the highest performance
		 * state is defined as the first state returned from
		 * the domain enumeration.
		 */
		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[0];
		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
		    &dom->cpd_states[dom->cpd_nstates - 1];

		/*
		 * Begin by assuming the CPU is running at the max perf state.
		 */
		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
	}

	return (dom);
}
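
/*
 * Illustrative call sequence (an assumption mirroring the file header
 * comment, not code from this file): as a CPU is configured, the CMT
 * subsystem might initialize the CPU's active power domain and retain
 * the handle for delivering later utilization events:
 *
 *	dom = cpupm_domain_init(cp, CPUPM_DTYPE_ACTIVE);
 *	if (dom != NULL)
 *		(keep dom for subsequent cpupm_utilization_event() calls)
 */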

/*
 * Return the id associated with the given type of domain
 * to which cp belongs
 */
id_t
cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
{
	return (cpupm_plat_domain_id(cp, type));
}

/*
 * Initiate a state change for the specified domain on behalf of cp
 */
int
cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
{
	if (cpupm_plat_change_state(cp, state) < 0)
		return (-1);

	DTRACE_PROBE2(cpupm__change__state,
	    cpupm_domain_t *, dom,
	    cpupm_state_t *, state);

	dom->cpd_state = state;
	return (0);
}
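
/*
 * The SDT probe above is visible from DTrace, where the double
 * underscores in the probe name become dashes. An illustrative
 * one-liner (assumed usage, not from this file):
 *
 *	dtrace -n 'sdt:::cpupm-change-state { trace(timestamp); }'
 */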

/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
			    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;
	hrtime_t	last;

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement governors to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_tw_governed == B_TRUE) {
				dom->cpd_tw_governed = B_FALSE;
				dom->cpd_tw = 0;
			}
		}
		break;

	case CPUPM_DOM_BUSY_FROM_IDLE:
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, perform some book keeping
			 * for the transient idle governor.
			 */
			if (dom->cpd_ti_governed == B_FALSE) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There have been enough
						 * transient idle transitions
						 * to justify governing future
						 * lowering requests.
						 */
						dom->cpd_ti_governed = B_TRUE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_tw_governed == B_TRUE) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient idle governor is
				 * also in place, examine the preceding idle
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_ti_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_ti_predict_interval)) {
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_ti_governed =
						    B_FALSE;
						dom->cpd_ti = 0;
					}
				}
				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some book keeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_ti_governed == B_TRUE) {
				if ((now - last) >= cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There have been enough
						 * non-transient idle periods
						 * to justify removing the
						 * governor.
						 */
						dom->cpd_ti_governed = B_FALSE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;

	case CPUPM_DOM_IDLE_FROM_BUSY:
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering power,
			 * perform some book keeping for the transient work
			 * governor.
			 */
			if (dom->cpd_tw_governed == B_FALSE) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There have been enough
						 * transient work transitions
						 * to justify governing future
						 * raising requests.
						 */
						dom->cpd_tw_governed = B_TRUE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_ti_governed == B_TRUE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);

				/*
				 * It's likely that we'll be governed for a
				 * while. If the transient work governor is
				 * also in place, examine the preceding busy
				 * interval to see if that still makes sense.
				 */
				if (dom->cpd_tw_governed == B_TRUE &&
				    ((now - last) >=
				    cpupm_tw_predict_interval)) {
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						dom->cpd_tw_governed =
						    B_FALSE;
						dom->cpd_tw = 0;
					}
				}
				return;
			}

			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some book keeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_tw_governed == B_TRUE) {
				if ((now - last) >= cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There has been enough
						 * non-transient work to
						 * justify removing the
						 * governor.
						 */
						dom->cpd_tw_governed = B_FALSE;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}
	/*
	 * Change the power state.
	 * Not much is currently done if this doesn't succeed.
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}
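
/*
 * Illustrative event delivery (a sketch of an assumed caller, not the
 * dispatcher's actual code): a caller observing the domain's utilization
 * dropping to zero might deliver:
 *
 *	cpupm_utilization_event(cp, gethrtime_unscaled(), dom,
 *	    CPUPM_DOM_IDLE_FROM_BUSY);
 *
 * using the same unscaled hrtime source as the governor benchmarking
 * below, so that "now" is comparable with the prediction intervals.
 */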


/*
 * Interface called by platforms to dynamically change the
 * MAX performance cpupm state
 */
void
cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
{
	cpupm_domain_t	*dom;
	id_t		did;
	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
	boolean_t	change_state = B_FALSE;
	cpupm_state_t	*new_state = NULL;

	did = cpupm_domain_id(cp, type);
	mutex_enter(&cpu_lock);
	dom = cpupm_domain_find(did, type);
	mutex_exit(&cpu_lock);

	/*
	 * A lock could be used to avoid changing the power state of the CPU
	 * while CPUPM_STATE_MAX_PERF is being redefined. However, since the
	 * occurrence of events that change MAX_PERF is infrequent, it may
	 * not be worth the overhead of additional locking. In the worst
	 * case, for one cycle the power may not be changed to the required
	 * level.
	 */
	if (dom != NULL) {
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
			change_state = B_TRUE;
		}

		/*
		 * If an out-of-range level is passed, use the lowest
		 * supported speed.
		 */
		if (max_perf_level >= dom->cpd_nstates &&
		    dom->cpd_nstates > 1) {
			max_perf_level = dom->cpd_nstates - 1;
		}

		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
		    &dom->cpd_states[max_perf_level];

		/*
		 * If the current state is MAX_PERF, change the current state
		 * to the new MAX_PERF.
		 */
		if (change_state) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (new_state) {
				(void) cpupm_change_state(cp, dom, new_state);
			}
		}
	}
}
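
/*
 * Illustrative usage (hypothetical level value): a platform detecting a
 * condition that caps performance, such as thermal throttling, could
 * call cpupm_redefine_max_activepwr_state(cp, 1) to make state index 1
 * the new maximum; an out-of-range level is clamped above to the lowest
 * supported speed.
 */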

/*
 * Benchmark some power state transitions and use the transition latencies as
 * a basis for initializing parameters for the transient idle and transient
 * work governors.
 *
 * Returns 0 on success or -1 if the governor parameters could not be
 * initialized.
 */
static int
cpupm_governor_initialize(void)
{
	cpu_t		*cp = CPU;
	cpupm_domain_t	*dom;
	cpupm_state_t	*low, *high;
	id_t		did;
	hrtime_t	start, delta, deltas = 0;
	int		iterations;

	did = cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE);
	if (did == CPUPM_NO_DOMAIN)
		return (-1);

	dom = cpupm_domain_find(did, CPUPM_DTYPE_ACTIVE);
	if (dom == NULL)
		return (-1);

	low = dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
	high = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

	for (iterations = 0; iterations < CPUPM_BENCHMARK_ITERS;
	    iterations++) {

		/*
		 * Measure the amount of time it takes to transition the
		 * domain down to the lowest, and back to the highest power
		 * state.
		 */
		start = gethrtime_unscaled();
		(void) cpupm_change_state(cp, dom, low);
		(void) cpupm_change_state(cp, dom, high);
		delta = gethrtime_unscaled() - start;

		DTRACE_PROBE1(cpupm__benchmark__latency,
		    hrtime_t, delta);

		deltas += delta;
	}

	/*
	 * Figure the average latency, and tune the transient work and
	 * transient idle prediction intervals accordingly.
	 */
	delta = deltas / iterations;

	cpupm_ti_predict_interval = delta * cpupm_ti_gov_multiple;
	cpupm_tw_predict_interval = delta * cpupm_tw_gov_multiple;

	return (0);
}

/*
 * Initiate a state change in all CPUPM domain instances of the specified type
 */
static void
cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
{
	cpu_t		*cp;
	pg_cmt_t	*pwr_pg;
	cpupm_domain_t	*dom;
	group_t		*hwset;
	group_iter_t	giter;
	pg_cpu_itr_t	cpu_iter;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (type) {
	case CPUPM_DTYPE_ACTIVE:
		hw = PGHW_POW_ACTIVE;
		break;
	default:
		/*
		 * Power domain types other than "active" unsupported.
		 */
		ASSERT(type == CPUPM_DTYPE_ACTIVE);
		return;
	}

	if ((hwset = pghw_set_lookup(hw)) == NULL)
		return;

	/*
	 * Iterate over the power domains
	 */
	group_iter_init(&giter);
	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {

		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;

		/*
		 * Iterate over the CPUs in each domain
		 */
		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			(void) cpupm_change_state(cp, dom,
			    dom->cpd_named_states[state]);
		}
	}
}