1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <stdio.h>
28#include <sys/types.h>
29#include <fcntl.h>
30#include <string.h>
31#include <stdlib.h>
32#include <unistd.h>
33#include <errno.h>
34
35#include <sys/stat.h>
36#include <poll.h>
37#include <signal.h>
38#include <pthread.h>
39#include <thread.h>
40#include <time.h>
41#include <sys/systeminfo.h>
42#include <sys/cred.h>
43#include <dirent.h>
44#include <libdevinfo.h>
45#include <sys/pm.h>
46#include <sys/ppmio.h>
47#include <locale.h>
48
49#include "fpsapi.h"
50#include "fpsd.h"
51#include "messages.h"
52
53
/* pm(4D) pseudo-device node used for all power-management ioctls below */
#define	DEV_PM	"/devices/pseudo/pm@0:pm"
/* Fallback "full power" level when PM_GET_FULL_POWER cannot be queried */
#define	DEFAULT_CPU_FULL_POWER	3

int  is_estar_system = 0;   /* Not an E* system, by default */
int  sys_pm_state = PM_SYSTEM_PM_DISABLED; /* By default autopm disabled */

/* libdevinfo handles; live only for the duration of init_estar_db() */
static di_node_t  fps_di_root = DI_NODE_NIL;
static di_prom_handle_t  fps_di_prom = DI_PROM_HANDLE_NIL;
static char **cpu_dpaths = NULL;  /* Used only on E* system */
static	int	*proc_ids = NULL;	/* Used only on E* system */
static	int	num_cpus = 0;	/* Used only on E* system */
static int  devpm_fd = -1;	/* Used only on E* system */
static int  full_pwr = DEFAULT_CPU_FULL_POWER;
68
69/*
70 * Initialize system PM state enable/disable and
71 * enable system default info logging accordingly.
72 * Note: Even for systems for which CPU PM is not enabled by
73 * default, disk PM may be enabled explicitly using power.conf;
74 * If power management is enabled, disable informational logging
75 * by default.
76 *   Some platforms don't have /dev/pm entry. It is perfectly OK.
77 * Don't complain if there is no /dev/pm entry.
78 * The platforms on which CPU PM is enabled by default, would
 * of course have /dev/pm entry.
80 *
81 * Note: open_dev_pm() should have been called initially before
82 *       calling this function.
83 *
84 */
85
86void
87update_pm_state()
88{
89	int pm_stat;
90
91	if (devpm_fd == -1)
92		return;
93
94	pm_stat = ioctl(devpm_fd, PM_GET_PM_STATE);
95
96	if (pm_stat == -1)
97		return;
98
99	sys_pm_state = pm_stat;
100
101}
102
103/*
104 * Some platforms don't support power management. (neither CPU nor disk)
105 * Those platforms don't have /dev/pm entry. Don't complain in such case.
 * Some platforms support PM only for disks. (they have /dev/pm entry.
107 * and logging is disabled on those platforms.)
108 * Some platforms support PM for both disks and CPUs (apart from others).
109 * Those platforms also have /dev/pm entry.
110 * Note that even desktops which support CPU PM E* can be custom
111 * configured to remove power management drivers. In that case,
112 * there won't be any /dev/pm entry and it is valid config.
113 *
114 */
115
116static  void  open_dev_pm()
117{
118	devpm_fd = open(DEV_PM, O_RDWR);
119
120}
121
122/*
123 * Initialize Estar info database.
124 *
125 */
126
127void
128init_estar_db()
129{
130	di_node_t  fnode, node;
131	di_prop_t  nextp;
132	char *path = NULL;
133	int cpu_i;
134	int  is_pmprop_found = 0;
135	pm_req_t  pmreq;
136	uchar_t  *prop_data = NULL;
137
138	/*
139	 * First open /dev/pm and keep it open for later uses.
140	 * Note that this needs to be open on all power management supported
141	 * systems. Some systems support power mgmt on only some
142	 * devices like disk, but not CPU. /dev/pm does not exist on
143	 * some platforms. Also PM drivers can be removed on custom
144	 * configurations.
145	 */
146	open_dev_pm();
147
148	if (devpm_fd == -1)
149		return;
150
151	fps_di_root = di_init("/", DINFOCPYALL);
152
153	if (DI_NODE_NIL == fps_di_root) {
154		fpsd_message(FPSD_EXIT_ERROR, FPS_WARNING, DI_INIT_FAIL);
155	}
156
157	fps_di_prom = di_prom_init();
158
159	if (DI_PROM_HANDLE_NIL == fps_di_prom) {
160		fpsd_message(FPSD_EXIT_ERROR, FPS_WARNING, DI_PROM_INIT_FAIL);
161		di_fini(fps_di_root);
162	}
163
164	if (di_prom_prop_lookup_bytes(fps_di_prom, fps_di_root,
165	    "energystar-v3", &prop_data) == -1)
166		goto exit_es;
167
168	/*
169	 * As a final check, also check for "us" driver property pm-components
170	 * On Estar systems, the driver should define this property.
171	 */
172
173	fnode = node = di_drv_first_node("us", fps_di_root);
174
175	if (DI_NODE_NIL == node) {
176		goto exit_es;
177	}
178
179	is_pmprop_found = 0;
180	for (nextp = di_prop_next(node, DI_PROP_NIL); nextp != DI_PROP_NIL;
181	    nextp = di_prop_next(node, nextp)) {
182		if (strcmp(di_prop_name(nextp), "pm-components") == 0) {
183			is_pmprop_found = 1;
184			break;
185		}
186	}
187
188	if (!is_pmprop_found)
189		goto exit_es;
190
191	is_estar_system = 1;  /* CPU power mgmt supported E* system */
192
193	num_cpus = 0;
194	while (node != DI_NODE_NIL) {
195		num_cpus++;
196		node = di_drv_next_node(node);
197	}
198
199	cpu_dpaths = (char **)calloc(num_cpus+1, sizeof (char *));
200	proc_ids = (int *)calloc(num_cpus+1, sizeof (int));
201	proc_ids[num_cpus] = -1;  /* Terminate processor ids by -1 */
202
203	cpu_i = 0;
204	for (node = fnode; node != DI_NODE_NIL; node = di_drv_next_node(node)) {
205		proc_ids[cpu_i] = -1;
206		cpu_dpaths[cpu_i] = NULL;
207
208		path = di_devfs_path(node);
209		if (NULL == path)
210			continue;
211		cpu_dpaths[cpu_i] = strdup(path);
212		di_devfs_path_free(path);
213		/*
214		 * Keep the mapping between path and processor IDs.
215		 * Currently, processor IDs are not used.
216		 * But may be used in future.
217		 */
218
219		/*
220		 * On workstation platforms (where CPU E* supported),
221		 * processor ID and instance numbers are same.
222		 * This may change in future. So watch out.
223		 */
224
225		proc_ids[cpu_i]  = di_instance(node); /* Currently unused. */
226		cpu_i++;
227	}
228
229	proc_ids[cpu_i] = -1;
230	cpu_dpaths[cpu_i] = NULL;
231
232	/* Initialize what "FULL POWER" mode is. */
233	full_pwr = DEFAULT_CPU_FULL_POWER;
234
235	pmreq.physpath = cpu_dpaths[0];
236	pmreq.component = 0;
237	pmreq.value = 0;
238	pmreq.data  = NULL;
239	pmreq.datasize  = 0;
240
241
242	full_pwr = ioctl(devpm_fd, PM_GET_FULL_POWER, &pmreq);
243	if (full_pwr == -1)
244		full_pwr = DEFAULT_CPU_FULL_POWER;
245exit_es:
246
247	if (fps_di_root != DI_NODE_NIL) {
248		di_fini(fps_di_root);
249		fps_di_root = DI_NODE_NIL;
250	}
251	if (DI_PROM_HANDLE_NIL != fps_di_prom) {
252		di_prom_fini(fps_di_prom);
253		fps_di_prom = DI_PROM_HANDLE_NIL;
254	}
255}
256
257/*
258 *  Return the min(idle_times), min(remaining_times), max(rem_time) for all
259 *  CPUs in full power mode. The "remain time" is the remaining
260 *  threshold time after which the CPU will make next lower level
261 *  power transition if left idle.
262 *  If the CPUs are not in full power mode or could not exactly determine
263 *  the power mode then return -1.
264 *  return 0 if CPUs are in full power mode.
265 */
266
267int
268get_idle_rem_stats(int *min_idle, int *min_rem, int *max_rem)
269{
270	int idle_time;
271	int pmstats[2];
272	int i;
273	pm_req_t  pmreq;
274	int ret;
275
276	*min_idle = -1;
277	*min_rem = -1;
278	*max_rem = -1;
279
280	for (i = 0; i < num_cpus; i++) {
281
282		pmreq.physpath = cpu_dpaths[i];
283		pmreq.component = 0;
284		pmreq.value = 0;
285		pmreq.data  = pmstats;
286		pmreq.datasize  = sizeof (pmstats);
287		idle_time = ioctl(devpm_fd, PM_GET_TIME_IDLE, &pmreq);
288		if (idle_time == -1)
289			continue;
290		ret = ioctl(devpm_fd, PM_GET_STATS, &pmreq);
291
292		/* Now pmstats[0] = cur power level; pmstats[1]=remain time */
293		if (ret == -1)
294			continue;
295		if (pmstats[0] != full_pwr)
296			continue;
297
298		if ((*min_idle == -1) || (idle_time < *min_idle))
299			*min_idle = idle_time;
300		if (*min_rem == -1 || pmstats[1] < *min_rem) {
301			*min_rem = pmstats[1];
302
303			/*
304			 * The remain time can be negative if there are 2 cpus
305			 * and 1 cpu is ready to transition
306			 * and the other one is not
307			 */
308			if (*min_rem < 0)
309				*min_rem = 0;
310		}
311		if (*max_rem == -1 || pmstats[1] > *max_rem)
312			*max_rem = pmstats[1];
313	}
314
315	return
316	    ((*min_idle == -1 || *min_rem == -1 || *max_rem == -1) ? -1 : 0);
317}
318
319/*
320 * Wait until CPU comes to full power state or timeout occurs.
321 * If multiple threads call this function, execute the
322 * PM ioctl system call only once.
323 * This is better than all 3 threads polling cpu pwr state same time.
324 *
325 * Callers of this function should not assume that on returning from
326 * this function CPU will be in full power state.
327 * (They should check again).
328 * This function just optimizes for performance during wait.
329 *
330 *
331 */
332
333void
334wait_for_pm_state_change()
335{
336	int res;
337	static pthread_mutex_t wrlck;
338	static int  is_active = 0;
339	static pm_req_t  pmreq;
340	static pm_state_change_t  pmsc;
341	static char  path[MAXPATHLEN];
342
343	int pwr = 0;
344	int cur_lvl = 0; /* 0 = unknown. 1=low, 3=full power */
345
346	pmreq.physpath = cpu_dpaths[0];
347	pmreq.component = 0;
348	pmreq.value = 0;
349	pmreq.data  = NULL;
350	pmreq.datasize  = 0;
351
352
353	(void) pthread_mutex_lock(&wrlck);
354
355	if (!is_active) {    /* This is the first thread trying to wait */
356		is_active = 1;
357		(void) pthread_mutex_unlock(&wrlck);
358
359		pmsc.physpath = path;
360		pmsc.size = MAXPATHLEN;
361		path[0] = 0; /* init not required. Just in case... */
362
363		/*
364		 * PM starts buffering the state changes after the first call to
365		 * PM_GET_STATE_CHANGE/PM_GET_STATE_CHANGE_WAIT
366		 *
367		 * The PM_GET_STATE_CHANGE is a non-blocking call where as
368		 * _WAIT is blocking call. The PM_GET_STATE_CHANGE also
369		 * returns all the info * about the latest buffered state
370		 * change if already buffered event is available. So it is
371		 * important to drain out all old events,
372		 * if you are only interested in future events.
373		 *
374		 * After the state changes the exact information/timestamp about
375		 * state changes are reflected in the ioctl struct.
376		 * To keep things simple, after draining out all buffered info,
377		 * we issue get current power to get the current power level and
378		 * then we issue another _WAIT command to get the
379		 * next power change.
380		 *
381		 */
382
383		do {
384
385			res =  ioctl(devpm_fd, PM_GET_STATE_CHANGE, &pmsc);
386
387			if (res == -1 && errno != EWOULDBLOCK) {
388				fpsd_message(FPSD_NO_EXIT, FPS_WARNING,
389				    INTERNAL_FAILURE_WARN,
390				    strerror(errno));
391				/* 1 second sleep. Avoid busy loop */
392				(void) poll(NULL, 0, 1000);
393				/* Probably will succeed in next call. */
394				goto psc_complete;
395			}
396
397		} while (errno != EWOULDBLOCK);
398
399		/* drain out all buffered state changes */
400
401		/* If current state is full power, then get out. */
402
403		do {
404			pwr = ioctl(devpm_fd, PM_GET_CURRENT_POWER, &pmreq);
405			if (pwr != -1) break;
406			if (errno == EAGAIN) {
407				(void) poll(NULL, 0, 1000);  /* 1 sec sleep */
408				continue;
409			} else {
410				fpsd_message(FPSD_NO_EXIT, FPS_WARNING,
411				    INTERNAL_FAILURE_WARN1,
412				    strerror(errno));
413				(void) poll(NULL, 0, 1000);  /* 1 sec sleep */
414				goto psc_complete;
415			}
416			/*CONSTCOND*/
417		} while (1);
418
419		if (pwr == full_pwr)
420			goto psc_complete;
421
422		while (cur_lvl != full_pwr) {
423			pmsc.physpath = path;
424			pmsc.size = MAXPATHLEN;
425			path[0] = 0; /* init not required. Just in case... */
426
427			do {
428				res = ioctl(devpm_fd,
429				    PM_GET_STATE_CHANGE_WAIT, &pmsc);
430				if (res == -1 && errno == EINTR) {
431					/* 1 second sleep */
432					(void) poll(NULL, 0, 1000);
433				}
434			} while (res == -1 && errno == EINTR);
435
436			if (res == -1) {
437				fpsd_message(FPSD_NO_EXIT, FPS_WARNING,
438				    INTERNAL_FAILURE_WARN2,
439				    strerror(errno));
440			/*
441			 * If there are failures in state change ioctl,
442			 * just would fall back to normal polling of
443			 * status later. get out quiet.
444			 */
445			/* avoid busy loop -- 1 second sleep */
446			(void) poll(NULL, 0, 1000);
447			goto psc_complete;
448		}
449
450		if (strcmp(pmsc.physpath, cpu_dpaths[0]) == 0 &&
451		    pmsc.new_level == full_pwr)
452			cur_lvl = full_pwr;
453		}
454
455psc_complete:
456		(void) pthread_mutex_lock(&wrlck);
457		is_active = 0;
458		(void) pthread_mutex_unlock(&wrlck);
459
460	} else {
461		/* Release the lock first */
462		(void) pthread_mutex_unlock(&wrlck);
463		/*
464		 * Already one other thread is active issuing ioctl call.
465		 * Just poll here to check the local flag without any expensive
466		 * ioctl calls until the transition is complete.
467		 */
468		(void) poll(NULL, 0, 1000); /* first time 1 second wait */
469		for (;;) {
470			(void) pthread_mutex_lock(&wrlck);
471			if (!is_active) {
472				(void) pthread_mutex_unlock(&wrlck);
473				break;
474			}
475			(void) pthread_mutex_unlock(&wrlck);
476			(void) poll(NULL, 0, 4000); /* 4 seconds wait */
477		}
478	}
479}
480