kern_cpu.c revision 141943
198937Sdes/*-
298937Sdes * Copyright (c) 2004-2005 Nate Lawson (SDG)
398937Sdes * All rights reserved.
498937Sdes *
598937Sdes * Redistribution and use in source and binary forms, with or without
698937Sdes * modification, are permitted provided that the following conditions
798937Sdes * are met:
898937Sdes * 1. Redistributions of source code must retain the above copyright
998937Sdes *    notice, this list of conditions and the following disclaimer.
1098937Sdes * 2. Redistributions in binary form must reproduce the above copyright
1198937Sdes *    notice, this list of conditions and the following disclaimer in the
1298937Sdes *    documentation and/or other materials provided with the distribution.
1398937Sdes *
1498937Sdes * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1598937Sdes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1698937Sdes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1798937Sdes * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
1898937Sdes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1998937Sdes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2098937Sdes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2198937Sdes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2298937Sdes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2398937Sdes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2498937Sdes * SUCH DAMAGE.
2598937Sdes */
2698937Sdes
2798937Sdes#include <sys/cdefs.h>
2898937Sdes__FBSDID("$FreeBSD: head/sys/kern/kern_cpu.c 141943 2005-02-15 07:22:42Z njl $");
2998937Sdes
3098937Sdes#include <sys/param.h>
3198937Sdes#include <sys/bus.h>
3298937Sdes#include <sys/cpu.h>
3398937Sdes#include <sys/eventhandler.h>
3498937Sdes#include <sys/kernel.h>
3598937Sdes#include <sys/malloc.h>
3698937Sdes#include <sys/module.h>
3798937Sdes#include <sys/proc.h>
3898937Sdes#include <sys/queue.h>
3998937Sdes#include <sys/sched.h>
4098937Sdes#include <sys/sysctl.h>
4198937Sdes#include <sys/systm.h>
4298937Sdes#include <sys/sbuf.h>
4398937Sdes#include <sys/timetc.h>
4498937Sdes
4598937Sdes#include "cpufreq_if.h"
4698937Sdes
/*
 * Common CPU frequency glue code.  Drivers for specific hardware can
 * attach to this interface to allow users to get and set the CPU frequency.
 */
5198937Sdes
/*
 * Upper bound on the number of levels handled here; it sizes the temporary
 * level arrays passed to CPUFREQ_LEVELS().  Levels are synthesized from
 * driver settings, so N settings may produce up to N^2 levels.
 */
#define CF_MAX_LEVELS	32
5798937Sdes
5898937Sdesstruct cpufreq_softc {
5998937Sdes	struct cf_level			curr_level;
6098937Sdes	int				curr_priority;
6198937Sdes	struct cf_level			saved_level;
6298937Sdes	int				saved_priority;
6398937Sdes	struct cf_level_lst		all_levels;
6498937Sdes	int				all_count;
6598937Sdes	device_t			dev;
6698937Sdes	struct sysctl_ctx_list		sysctl_ctx;
6798937Sdes};
6898937Sdes
6998937Sdesstruct cf_setting_array {
7098937Sdes	struct cf_setting		sets[MAX_SETTINGS];
7198937Sdes	int				count;
7298937Sdes	TAILQ_ENTRY(cf_setting_array)	link;
7398937Sdes};
7498937Sdes
7598937SdesTAILQ_HEAD(cf_setting_lst, cf_setting_array);
7698937Sdes
7798937Sdesstatic int	cpufreq_attach(device_t dev);
7898937Sdesstatic int	cpufreq_detach(device_t dev);
7998937Sdesstatic void	cpufreq_evaluate(void *arg);
8098937Sdesstatic int	cf_set_method(device_t dev, const struct cf_level *level,
8198937Sdes		    int priority);
8298937Sdesstatic int	cf_get_method(device_t dev, struct cf_level *level);
8398937Sdesstatic int	cf_levels_method(device_t dev, struct cf_level *levels,
8498937Sdes		    int *count);
8598937Sdesstatic int	cpufreq_insert_abs(struct cpufreq_softc *sc,
8698937Sdes		    struct cf_setting *sets, int count);
8798937Sdesstatic int	cpufreq_expand_set(struct cpufreq_softc *sc,
8898937Sdes		    struct cf_setting_array *set_arr);
8998937Sdesstatic struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
9098937Sdes		    struct cf_level *dup, struct cf_setting *set);
9198937Sdesstatic int	cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
9298937Sdesstatic int	cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
9398937Sdes
9498937Sdesstatic device_method_t cpufreq_methods[] = {
9598937Sdes	DEVMETHOD(device_probe,		bus_generic_probe),
9698937Sdes	DEVMETHOD(device_attach,	cpufreq_attach),
9798937Sdes	DEVMETHOD(device_detach,	cpufreq_detach),
9898937Sdes
9998937Sdes        DEVMETHOD(cpufreq_set,		cf_set_method),
10098937Sdes        DEVMETHOD(cpufreq_get,		cf_get_method),
10198937Sdes        DEVMETHOD(cpufreq_levels,	cf_levels_method),
10298937Sdes	{0, 0}
10398937Sdes};
10498937Sdesstatic driver_t cpufreq_driver = {
10598937Sdes	"cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
10698937Sdes};
10798937Sdesstatic devclass_t cpufreq_dc;
10898937SdesDRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
10998937Sdes
11098937Sdesstatic eventhandler_tag cf_ev_tag;
11198937Sdes
11298937Sdesstatic int
11398937Sdescpufreq_attach(device_t dev)
11498937Sdes{
11598937Sdes	struct cpufreq_softc *sc;
11698937Sdes	device_t parent;
11798937Sdes	int numdevs;
11898937Sdes
11998937Sdes	sc = device_get_softc(dev);
12098937Sdes	parent = device_get_parent(dev);
12198937Sdes	sc->dev = dev;
12298937Sdes	sysctl_ctx_init(&sc->sysctl_ctx);
12398937Sdes	TAILQ_INIT(&sc->all_levels);
12498937Sdes	sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
12598937Sdes	sc->saved_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
12698937Sdes
12798937Sdes	/*
12898937Sdes	 * Only initialize one set of sysctls for all CPUs.  In the future,
12998937Sdes	 * if multiple CPUs can have different settings, we can move these
13098937Sdes	 * sysctls to be under every CPU instead of just the first one.
13198937Sdes	 */
13298937Sdes	numdevs = devclass_get_count(cpufreq_dc);
13398937Sdes	if (numdevs > 1)
13498937Sdes		return (0);
13598937Sdes
13698937Sdes	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
13798937Sdes	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
13898937Sdes	    OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
13998937Sdes	    cpufreq_curr_sysctl, "I", "Current CPU frequency");
14098937Sdes	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
14198937Sdes	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
14298937Sdes	    OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
14398937Sdes	    cpufreq_levels_sysctl, "A", "CPU frequency levels");
14498937Sdes	cf_ev_tag = EVENTHANDLER_REGISTER(cpufreq_changed, cpufreq_evaluate,
14598937Sdes	    NULL, EVENTHANDLER_PRI_ANY);
14698937Sdes
14798937Sdes	return (0);
14898937Sdes}
14998937Sdes
15098937Sdesstatic int
15198937Sdescpufreq_detach(device_t dev)
15298937Sdes{
15398937Sdes	struct cpufreq_softc *sc;
15498937Sdes	int numdevs;
15598937Sdes
15698937Sdes	sc = device_get_softc(dev);
15798937Sdes	sysctl_ctx_free(&sc->sysctl_ctx);
15898937Sdes
15998937Sdes	/* Only clean up these resources when the last device is detaching. */
16098937Sdes	numdevs = devclass_get_count(cpufreq_dc);
16198937Sdes	if (numdevs == 1)
16298937Sdes		EVENTHANDLER_DEREGISTER(cpufreq_changed, cf_ev_tag);
16398937Sdes
16498937Sdes	return (0);
16598937Sdes}
16698937Sdes
/*
 * Handler registered for the cpufreq_changed event.  It is currently a
 * placeholder that does nothing.
 */
static void
cpufreq_evaluate(void *arg)
{

	/* TODO: Re-evaluate when notified of changes to drivers. */
}
17298937Sdes
17398937Sdesstatic int
17498937Sdescf_set_method(device_t dev, const struct cf_level *level, int priority)
17598937Sdes{
17698937Sdes	struct cpufreq_softc *sc;
17798937Sdes	const struct cf_setting *set;
17898937Sdes	struct pcpu *pc;
17998937Sdes	int cpu_id, error, i;
18098937Sdes
18198937Sdes	sc = device_get_softc(dev);
18298937Sdes
18398937Sdes	/*
184	 * Check that the TSC isn't being used as a timecounter.
185	 * If it is, then return EBUSY and refuse to change the
186	 * clock speed.
187	 */
188	if (strcmp(timecounter->tc_name, "TSC") == 0)
189		return (EBUSY);
190
191	/*
192	 * If the caller didn't specify a level and one is saved, prepare to
193	 * restore the saved level.  If none has been saved, return an error.
194	 * If they did specify one, but the requested level has a lower
195	 * priority, don't allow the new level right now.
196	 */
197	if (level == NULL) {
198		if (sc->saved_level.total_set.freq != CPUFREQ_VAL_UNKNOWN) {
199			level = &sc->saved_level;
200			priority = sc->saved_priority;
201		} else
202			return (ENXIO);
203	} else if (priority < sc->curr_priority)
204		return (EPERM);
205
206	/* If already at this level, just return. */
207	if (CPUFREQ_CMP(sc->curr_level.total_set.freq, level->total_set.freq))
208		return (0);
209
210	/* First, set the absolute frequency via its driver. */
211	set = &level->abs_set;
212	if (set->dev) {
213		if (!device_is_attached(set->dev)) {
214			error = ENXIO;
215			goto out;
216		}
217
218		/* Bind to the target CPU before switching, if necessary. */
219		cpu_id = PCPU_GET(cpuid);
220		pc = cpu_get_pcpu(set->dev);
221		if (cpu_id != pc->pc_cpuid) {
222			mtx_lock_spin(&sched_lock);
223			sched_bind(curthread, pc->pc_cpuid);
224			mtx_unlock_spin(&sched_lock);
225		}
226		error = CPUFREQ_DRV_SET(set->dev, set);
227		if (cpu_id != pc->pc_cpuid) {
228			mtx_lock_spin(&sched_lock);
229			sched_unbind(curthread);
230			mtx_unlock_spin(&sched_lock);
231		}
232		if (error) {
233			goto out;
234		}
235	}
236
237	/* Next, set any/all relative frequencies via their drivers. */
238	for (i = 0; i < level->rel_count; i++) {
239		set = &level->rel_set[i];
240		if (!device_is_attached(set->dev)) {
241			error = ENXIO;
242			goto out;
243		}
244
245		/* Bind to the target CPU before switching, if necessary. */
246		cpu_id = PCPU_GET(cpuid);
247		pc = cpu_get_pcpu(set->dev);
248		if (cpu_id != pc->pc_cpuid) {
249			mtx_lock_spin(&sched_lock);
250			sched_bind(curthread, pc->pc_cpuid);
251			mtx_unlock_spin(&sched_lock);
252		}
253		error = CPUFREQ_DRV_SET(set->dev, set);
254		if (cpu_id != pc->pc_cpuid) {
255			mtx_lock_spin(&sched_lock);
256			sched_unbind(curthread);
257			mtx_unlock_spin(&sched_lock);
258		}
259		if (error) {
260			/* XXX Back out any successful setting? */
261			goto out;
262		}
263	}
264
265	/* If we were restoring a saved state, reset it to "unused". */
266	if (level == &sc->saved_level) {
267		sc->saved_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
268		sc->saved_priority = 0;
269	}
270
271	/*
272	 * Before recording the current level, check if we're going to a
273	 * higher priority and have not saved a level yet.  If so, save the
274	 * previous level and priority.
275	 */
276	if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
277	    sc->saved_level.total_set.freq == CPUFREQ_VAL_UNKNOWN &&
278	    priority > sc->curr_priority) {
279		sc->saved_level = sc->curr_level;
280		sc->saved_priority = sc->curr_priority;
281	}
282	sc->curr_level = *level;
283	sc->curr_priority = priority;
284	error = 0;
285
286out:
287	if (error)
288		device_printf(set->dev, "set freq failed, err %d\n", error);
289	return (error);
290}
291
292static int
293cf_get_method(device_t dev, struct cf_level *level)
294{
295	struct cpufreq_softc *sc;
296	struct cf_level *levels;
297	struct cf_setting *curr_set, set;
298	struct pcpu *pc;
299	device_t *devs;
300	int count, error, i, numdevs;
301	uint64_t rate;
302
303	sc = device_get_softc(dev);
304	curr_set = &sc->curr_level.total_set;
305	levels = NULL;
306
307	/* If we already know the current frequency, we're done. */
308	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN)
309		goto out;
310
311	/*
312	 * We need to figure out the current level.  Loop through every
313	 * driver, getting the current setting.  Then, attempt to get a best
314	 * match of settings against each level.
315	 */
316	count = CF_MAX_LEVELS;
317	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
318	if (levels == NULL)
319		return (ENOMEM);
320	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
321	if (error)
322		goto out;
323	error = device_get_children(device_get_parent(dev), &devs, &numdevs);
324	if (error)
325		goto out;
326	for (i = 0; i < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; i++) {
327		if (!device_is_attached(devs[i]))
328			continue;
329		error = CPUFREQ_DRV_GET(devs[i], &set);
330		if (error)
331			continue;
332		for (i = 0; i < count; i++) {
333			if (CPUFREQ_CMP(set.freq, levels[i].total_set.freq)) {
334				sc->curr_level = levels[i];
335				break;
336			}
337		}
338	}
339	free(devs, M_TEMP);
340	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN)
341		goto out;
342
343	/*
344	 * We couldn't find an exact match, so attempt to estimate and then
345	 * match against a level.
346	 */
347	pc = cpu_get_pcpu(dev);
348	if (pc == NULL) {
349		error = ENXIO;
350		goto out;
351	}
352	cpu_est_clockrate(pc->pc_cpuid, &rate);
353	rate /= 1000000;
354	for (i = 0; i < count; i++) {
355		if (CPUFREQ_CMP(rate, levels[i].total_set.freq)) {
356			sc->curr_level = levels[i];
357			break;
358		}
359	}
360
361out:
362	if (levels)
363		free(levels, M_TEMP);
364	*level = sc->curr_level;
365	return (0);
366}
367
368static int
369cf_levels_method(device_t dev, struct cf_level *levels, int *count)
370{
371	struct cf_setting_array *set_arr;
372	struct cf_setting_lst rel_sets;
373	struct cpufreq_softc *sc;
374	struct cf_level *lev;
375	struct cf_setting *sets;
376	struct pcpu *pc;
377	device_t *devs;
378	int error, i, numdevs, set_count, type;
379	uint64_t rate;
380
381	if (levels == NULL || count == NULL)
382		return (EINVAL);
383
384	TAILQ_INIT(&rel_sets);
385	sc = device_get_softc(dev);
386	error = device_get_children(device_get_parent(dev), &devs, &numdevs);
387	if (error)
388		return (error);
389	sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
390	if (sets == NULL) {
391		free(devs, M_TEMP);
392		return (ENOMEM);
393	}
394
395	/* Get settings from all cpufreq drivers. */
396	for (i = 0; i < numdevs; i++) {
397		/* Skip devices that aren't ready. */
398		if (!device_is_attached(devs[i]))
399			continue;
400
401		/*
402		 * Get settings, skipping drivers that offer no settings or
403		 * provide settings for informational purposes only.
404		 */
405		set_count = MAX_SETTINGS;
406		error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count, &type);
407		if (error || set_count == 0 || (type & CPUFREQ_FLAG_INFO_ONLY))
408			continue;
409
410		/* Add the settings to our absolute/relative lists. */
411		switch (type & CPUFREQ_TYPE_MASK) {
412		case CPUFREQ_TYPE_ABSOLUTE:
413			error = cpufreq_insert_abs(sc, sets, set_count);
414			break;
415		case CPUFREQ_TYPE_RELATIVE:
416			set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
417			if (set_arr == NULL) {
418				error = ENOMEM;
419				goto out;
420			}
421			bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
422			set_arr->count = set_count;
423			TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
424			break;
425		default:
426			error = EINVAL;
427			break;
428		}
429		if (error)
430			goto out;
431	}
432
433	/* If there are no absolute levels, create a fake one at 100%. */
434	if (TAILQ_EMPTY(&sc->all_levels)) {
435		bzero(&sets[0], sizeof(*sets));
436		pc = cpu_get_pcpu(dev);
437		if (pc == NULL) {
438			error = ENXIO;
439			goto out;
440		}
441		cpu_est_clockrate(pc->pc_cpuid, &rate);
442		sets[0].freq = rate / 1000000;
443		error = cpufreq_insert_abs(sc, sets, 1);
444		if (error)
445			goto out;
446	}
447
448	/* Create a combined list of absolute + relative levels. */
449	TAILQ_FOREACH(set_arr, &rel_sets, link)
450		cpufreq_expand_set(sc, set_arr);
451
452	/* If the caller doesn't have enough space, return the actual count. */
453	if (sc->all_count > *count) {
454		*count = sc->all_count;
455		error = E2BIG;
456		goto out;
457	}
458
459	/* Finally, output the list of levels. */
460	i = 0;
461	TAILQ_FOREACH(lev, &sc->all_levels, link) {
462		levels[i] = *lev;
463		i++;
464	}
465	*count = sc->all_count;
466	error = 0;
467
468out:
469	/* Clear all levels since we regenerate them each time. */
470	while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
471		TAILQ_REMOVE(&sc->all_levels, lev, link);
472		free(lev, M_TEMP);
473	}
474	while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
475		TAILQ_REMOVE(&rel_sets, set_arr, link);
476		free(set_arr, M_TEMP);
477	}
478	sc->all_count = 0;
479	free(devs, M_TEMP);
480	free(sets, M_TEMP);
481	return (error);
482}
483
484/*
485 * Create levels for an array of absolute settings and insert them in
486 * sorted order in the specified list.
487 */
488static int
489cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
490    int count)
491{
492	struct cf_level_lst *list;
493	struct cf_level *level, *search;
494	int i;
495
496	list = &sc->all_levels;
497	for (i = 0; i < count; i++) {
498		level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
499		if (level == NULL)
500			return (ENOMEM);
501		level->abs_set = sets[i];
502		level->total_set = sets[i];
503		level->total_set.dev = NULL;
504		sc->all_count++;
505
506		if (TAILQ_EMPTY(list)) {
507			TAILQ_INSERT_HEAD(list, level, link);
508			continue;
509		}
510
511		TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
512			if (sets[i].freq <= search->total_set.freq) {
513				TAILQ_INSERT_AFTER(list, search, level, link);
514				break;
515			}
516		}
517	}
518	return (0);
519}
520
521/*
522 * Expand a group of relative settings, creating derived levels from them.
523 */
524static int
525cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
526{
527	struct cf_level *fill, *search;
528	struct cf_setting *set;
529	int i;
530
531	TAILQ_FOREACH(search, &sc->all_levels, link) {
532		/* Skip this level if we've already modified it. */
533		for (i = 0; i < search->rel_count; i++) {
534			if (search->rel_set[i].dev == set_arr->sets[0].dev)
535				break;
536		}
537		if (i != search->rel_count)
538			continue;
539
540		/* Add each setting to the level, duplicating if necessary. */
541		for (i = 0; i < set_arr->count; i++) {
542			set = &set_arr->sets[i];
543
544			/*
545			 * If this setting is less than 100%, split the level
546			 * into two and add this setting to the new level.
547			 */
548			fill = search;
549			if (set->freq < 10000)
550				fill = cpufreq_dup_set(sc, search, set);
551
552			/*
553			 * The new level was a duplicate of an existing level
554			 * so we freed it.  Go to the next setting.
555			 */
556			if (fill == NULL)
557				continue;
558
559			/* Add this setting to the existing or new level. */
560			KASSERT(fill->rel_count < MAX_SETTINGS,
561			    ("cpufreq: too many relative drivers (%d)",
562			    MAX_SETTINGS));
563			fill->rel_set[fill->rel_count] = *set;
564			fill->rel_count++;
565		}
566	}
567
568	return (0);
569}
570
571static struct cf_level *
572cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
573    struct cf_setting *set)
574{
575	struct cf_level_lst *list;
576	struct cf_level *fill, *itr;
577	struct cf_setting *fill_set, *itr_set;
578	int i;
579
580	/*
581	 * Create a new level, copy it from the old one, and update the
582	 * total frequency and power by the percentage specified in the
583	 * relative setting.
584	 */
585	fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
586	if (fill == NULL)
587		return (NULL);
588	*fill = *dup;
589	fill_set = &fill->total_set;
590	fill_set->freq =
591	    ((uint64_t)fill_set->freq * set->freq) / 10000;
592	if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
593		fill_set->power = ((uint64_t)fill_set->power * set->freq)
594		    / 10000;
595	}
596	if (set->lat != CPUFREQ_VAL_UNKNOWN) {
597		if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
598			fill_set->lat += set->lat;
599		else
600			fill_set->lat = set->lat;
601	}
602
603	/*
604	 * If we copied an old level that we already modified (say, at 100%),
605	 * we need to remove that setting before adding this one.  Since we
606	 * process each setting array in order, we know any settings for this
607	 * driver will be found at the end.
608	 */
609	for (i = fill->rel_count; i != 0; i--) {
610		if (fill->rel_set[i - 1].dev != set->dev)
611			break;
612		fill->rel_count--;
613	}
614
615	/*
616	 * Insert the new level in sorted order.  If we find a duplicate,
617	 * free the new level.  We can do this since any existing level will
618	 * be guaranteed to have the same or less settings and thus consume
619	 * less power.  For example, a level with one absolute setting of
620	 * 800 Mhz uses less power than one composed of an absolute setting
621	 * of 1600 Mhz and a relative setting at 50%.
622	 */
623	list = &sc->all_levels;
624	if (TAILQ_EMPTY(list)) {
625		TAILQ_INSERT_HEAD(list, fill, link);
626	} else {
627		TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
628			itr_set = &itr->total_set;
629			if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
630				free(fill, M_TEMP);
631				fill = NULL;
632				break;
633			} else if (fill_set->freq < itr_set->freq) {
634				TAILQ_INSERT_AFTER(list, itr, fill, link);
635				sc->all_count++;
636				break;
637			}
638		}
639	}
640
641	return (fill);
642}
643
644static int
645cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
646{
647	struct cpufreq_softc *sc;
648	struct cf_level *levels;
649	int count, devcount, error, freq, i, n;
650	device_t *devs;
651
652	devs = NULL;
653	sc = oidp->oid_arg1;
654	levels = malloc(CF_MAX_LEVELS * sizeof(*levels), M_TEMP, M_NOWAIT);
655	if (levels == NULL)
656		return (ENOMEM);
657
658	error = CPUFREQ_GET(sc->dev, &levels[0]);
659	if (error)
660		goto out;
661	freq = levels[0].total_set.freq;
662	error = sysctl_handle_int(oidp, &freq, 0, req);
663	if (error != 0 || req->newptr == NULL)
664		goto out;
665
666	/*
667	 * While we only call cpufreq_get() on one device (assuming all
668	 * CPUs have equal levels), we call cpufreq_set() on all CPUs.
669	 * This is needed for some MP systems.
670	 */
671	error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
672	if (error)
673		goto out;
674	for (n = 0; n < devcount; n++) {
675		count = CF_MAX_LEVELS;
676		error = CPUFREQ_LEVELS(devs[n], levels, &count);
677		if (error)
678			break;
679		for (i = 0; i < count; i++) {
680			if (CPUFREQ_CMP(levels[i].total_set.freq, freq)) {
681				error = CPUFREQ_SET(devs[n], &levels[i],
682				    CPUFREQ_PRIO_USER);
683				break;
684			}
685		}
686		if (i == count) {
687			error = EINVAL;
688			break;
689		}
690	}
691
692out:
693	if (devs)
694		free(devs, M_TEMP);
695	if (levels)
696		free(levels, M_TEMP);
697	return (error);
698}
699
700static int
701cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
702{
703	struct cpufreq_softc *sc;
704	struct cf_level *levels;
705	struct cf_setting *set;
706	struct sbuf sb;
707	int count, error, i;
708
709	sc = oidp->oid_arg1;
710	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
711
712	/* Get settings from the device and generate the output string. */
713	count = CF_MAX_LEVELS;
714	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
715	if (levels == NULL)
716		return (ENOMEM);
717	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
718	if (error)
719		goto out;
720	if (count) {
721		for (i = 0; i < count; i++) {
722			set = &levels[i].total_set;
723			sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
724		}
725	} else
726		sbuf_cpy(&sb, "0");
727	sbuf_trim(&sb);
728	sbuf_finish(&sb);
729	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
730
731out:
732	free(levels, M_TEMP);
733	sbuf_delete(&sb);
734	return (error);
735}
736
737int
738cpufreq_register(device_t dev)
739{
740	device_t cf_dev, cpu_dev;
741
742	/*
743	 * Add only one cpufreq device to each CPU.  Currently, all CPUs
744	 * must offer the same levels and be switched at the same time.
745	 */
746	cpu_dev = device_get_parent(dev);
747	if (device_find_child(cpu_dev, "cpufreq", -1))
748		return (0);
749
750	/* Add the child device and possibly sysctls. */
751	cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
752	if (cf_dev == NULL)
753		return (ENOMEM);
754	device_quiet(cf_dev);
755
756	return (device_probe_and_attach(cf_dev));
757}
758
759int
760cpufreq_unregister(device_t dev)
761{
762	device_t cf_dev, *devs;
763	int cfcount, count, devcount, error, i, type;
764	struct cf_setting set;
765
766	/*
767	 * If this is the last cpufreq child device, remove the control
768	 * device as well.  We identify cpufreq children by calling a method
769	 * they support.
770	 */
771	error = device_get_children(device_get_parent(dev), &devs, &devcount);
772	if (error)
773		return (error);
774	cf_dev = devclass_get_device(cpufreq_dc, 0);
775	cfcount = 0;
776	for (i = 0; i < devcount; i++) {
777		if (!device_is_attached(devs[i]))
778			continue;
779		count = 1;
780		if (CPUFREQ_DRV_SETTINGS(devs[i], &set, &count, &type) == 0)
781			cfcount++;
782	}
783	if (cfcount <= 1)
784		device_delete_child(device_get_parent(cf_dev), cf_dev);
785	free(devs, M_TEMP);
786
787	return (0);
788}
789