kern_cpu.c revision 141824
1141240Snjl/*-
2141240Snjl * Copyright (c) 2004-2005 Nate Lawson (SDG)
3141240Snjl * All rights reserved.
4141240Snjl *
5141240Snjl * Redistribution and use in source and binary forms, with or without
6141240Snjl * modification, are permitted provided that the following conditions
7141240Snjl * are met:
8141240Snjl * 1. Redistributions of source code must retain the above copyright
9141240Snjl *    notice, this list of conditions and the following disclaimer.
10141240Snjl * 2. Redistributions in binary form must reproduce the above copyright
11141240Snjl *    notice, this list of conditions and the following disclaimer in the
12141240Snjl *    documentation and/or other materials provided with the distribution.
13141240Snjl *
14141240Snjl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15141240Snjl * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16141240Snjl * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17141240Snjl * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18141240Snjl * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19141240Snjl * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20141240Snjl * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21141240Snjl * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22141240Snjl * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23141240Snjl * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24141240Snjl * SUCH DAMAGE.
25141240Snjl */
26141240Snjl
27141240Snjl#include <sys/cdefs.h>
28141240Snjl__FBSDID("$FreeBSD: head/sys/kern/kern_cpu.c 141824 2005-02-13 18:49:48Z njl $");
29141240Snjl
30141240Snjl#include <sys/param.h>
31141240Snjl#include <sys/bus.h>
32141240Snjl#include <sys/cpu.h>
33141240Snjl#include <sys/eventhandler.h>
34141240Snjl#include <sys/kernel.h>
35141240Snjl#include <sys/malloc.h>
36141240Snjl#include <sys/module.h>
37141240Snjl#include <sys/proc.h>
38141240Snjl#include <sys/queue.h>
39141240Snjl#include <sys/sched.h>
40141240Snjl#include <sys/sysctl.h>
41141240Snjl#include <sys/systm.h>
42141240Snjl#include <sys/sbuf.h>
43141814Snjl#include <sys/timetc.h>
44141240Snjl
45141240Snjl#include "cpufreq_if.h"
46141240Snjl
47141240Snjl/*
48141240Snjl * Common CPU frequency glue code.  Drivers for specific hardware can
49141240Snjl * attach this interface to allow users to get/set the CPU frequency.
50141240Snjl */
51141240Snjl
52141240Snjl/*
53141240Snjl * Number of levels we can handle.  Levels are synthesized from settings
54141240Snjl * so for N settings there may be N^2 levels.
55141240Snjl */
56141240Snjl#define CF_MAX_LEVELS	32
57141240Snjl
/*
 * Per-instance state of a cpufreq control device.
 *
 * curr_level	Level most recently applied by cf_set_method() or detected
 *		by cf_get_method(); its total_set.freq starts out as
 *		CPUFREQ_VAL_UNKNOWN.
 * priority	Priority argument recorded by the last successful
 *		cf_set_method() call.
 * all_count	Number of levels counted while building all_levels.
 * all_levels	Scratch list of levels; rebuilt and emptied again on every
 *		cf_levels_method() call.
 * dev		The cpufreq device this softc belongs to.
 * sysctl_ctx	Context for the sysctls created in cpufreq_attach() and
 *		released in cpufreq_detach().
 */
58141240Snjlstruct cpufreq_softc {
59141240Snjl	struct cf_level			curr_level;
60141240Snjl	int				priority;
61141413Snjl	int				all_count;
62141240Snjl	struct cf_level_lst		all_levels;
63141240Snjl	device_t			dev;
64141240Snjl	struct sysctl_ctx_list		sysctl_ctx;
65141240Snjl};
66141240Snjl
/*
 * One driver's worth of settings, queued on a cf_setting_lst.
 * cf_levels_method() uses these to hold the settings of each relative
 * driver until they are expanded into levels; "count" is the number of
 * valid entries in "sets".
 */
67141240Snjlstruct cf_setting_array {
68141240Snjl	struct cf_setting		sets[MAX_SETTINGS];
69141240Snjl	int				count;
70141240Snjl	TAILQ_ENTRY(cf_setting_array)	link;
71141240Snjl};
72141240Snjl
73141240SnjlTAILQ_HEAD(cf_setting_lst, cf_setting_array);
74141240Snjl
75141240Snjlstatic int	cpufreq_attach(device_t dev);
76141240Snjlstatic int	cpufreq_detach(device_t dev);
77141240Snjlstatic void	cpufreq_evaluate(void *arg);
78141240Snjlstatic int	cf_set_method(device_t dev, const struct cf_level *level,
79141240Snjl		    int priority);
80141240Snjlstatic int	cf_get_method(device_t dev, struct cf_level *level);
81141240Snjlstatic int	cf_levels_method(device_t dev, struct cf_level *levels,
82141240Snjl		    int *count);
83141413Snjlstatic int	cpufreq_insert_abs(struct cpufreq_softc *sc,
84141240Snjl		    struct cf_setting *sets, int count);
85141413Snjlstatic int	cpufreq_expand_set(struct cpufreq_softc *sc,
86141413Snjl		    struct cf_setting_array *set_arr);
87141413Snjlstatic struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
88141413Snjl		    struct cf_level *dup, struct cf_setting *set);
89141240Snjlstatic int	cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
90141240Snjlstatic int	cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
91141240Snjl
/*
 * Method table and driver glue for the "cpufreq" control device.  It
 * implements the device attach/detach methods plus the cpufreq_set,
 * cpufreq_get and cpufreq_levels interface methods, and is registered
 * as a child driver of "cpu".
 */
92141240Snjlstatic device_method_t cpufreq_methods[] = {
93141240Snjl	DEVMETHOD(device_probe,		bus_generic_probe),
94141240Snjl	DEVMETHOD(device_attach,	cpufreq_attach),
95141240Snjl	DEVMETHOD(device_detach,	cpufreq_detach),
96141240Snjl
97141240Snjl        DEVMETHOD(cpufreq_set,		cf_set_method),
98141240Snjl        DEVMETHOD(cpufreq_get,		cf_get_method),
99141240Snjl        DEVMETHOD(cpufreq_levels,	cf_levels_method),
100141240Snjl	{0, 0}
101141240Snjl};
102141240Snjlstatic driver_t cpufreq_driver = {
103141240Snjl	"cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
104141240Snjl};
105141240Snjlstatic devclass_t cpufreq_dc;
106141240SnjlDRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
107141240Snjl
108141240Snjlstatic eventhandler_tag cf_ev_tag;
109141240Snjl
110141240Snjlstatic int
111141240Snjlcpufreq_attach(device_t dev)
112141240Snjl{
113141240Snjl	struct cpufreq_softc *sc;
114141240Snjl	device_t parent;
115141240Snjl	int numdevs;
116141240Snjl
117141240Snjl	sc = device_get_softc(dev);
118141240Snjl	parent = device_get_parent(dev);
119141240Snjl	sc->dev = dev;
120141240Snjl	sysctl_ctx_init(&sc->sysctl_ctx);
121141240Snjl	TAILQ_INIT(&sc->all_levels);
122141240Snjl	sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
123141240Snjl
124141240Snjl	/*
125141240Snjl	 * Only initialize one set of sysctls for all CPUs.  In the future,
126141240Snjl	 * if multiple CPUs can have different settings, we can move these
127141240Snjl	 * sysctls to be under every CPU instead of just the first one.
128141240Snjl	 */
129141240Snjl	numdevs = devclass_get_count(cpufreq_dc);
130141240Snjl	if (numdevs > 1)
131141240Snjl		return (0);
132141240Snjl
133141240Snjl	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
134141240Snjl	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
135141240Snjl	    OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
136141240Snjl	    cpufreq_curr_sysctl, "I", "Current CPU frequency");
137141240Snjl	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
138141240Snjl	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
139141240Snjl	    OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
140141240Snjl	    cpufreq_levels_sysctl, "A", "CPU frequency levels");
141141240Snjl	cf_ev_tag = EVENTHANDLER_REGISTER(cpufreq_changed, cpufreq_evaluate,
142141240Snjl	    NULL, EVENTHANDLER_PRI_ANY);
143141240Snjl
144141240Snjl	return (0);
145141240Snjl}
146141240Snjl
147141240Snjlstatic int
148141240Snjlcpufreq_detach(device_t dev)
149141240Snjl{
150141240Snjl	struct cpufreq_softc *sc;
151141240Snjl	int numdevs;
152141240Snjl
153141240Snjl	sc = device_get_softc(dev);
154141240Snjl	sysctl_ctx_free(&sc->sysctl_ctx);
155141240Snjl
156141240Snjl	/* Only clean up these resources when the last device is detaching. */
157141240Snjl	numdevs = devclass_get_count(cpufreq_dc);
158141240Snjl	if (numdevs == 1)
159141240Snjl		EVENTHANDLER_DEREGISTER(cpufreq_changed, cf_ev_tag);
160141240Snjl
161141240Snjl	return (0);
162141240Snjl}
163141240Snjl
/*
 * Handler registered for the cpufreq_changed event by cpufreq_attach().
 * Currently a placeholder that does nothing.
 */
static void
cpufreq_evaluate(void *arg)
{

	/* TODO: Re-evaluate when notified of changes to drivers. */
}
169141240Snjl
170141240Snjlstatic int
171141240Snjlcf_set_method(device_t dev, const struct cf_level *level, int priority)
172141240Snjl{
173141240Snjl	struct cpufreq_softc *sc;
174141240Snjl	const struct cf_setting *set;
175141814Snjl	struct pcpu *pc;
176141814Snjl	int cpu_id, error, i;
177141240Snjl
178141240Snjl	sc = device_get_softc(dev);
179141240Snjl
180141814Snjl	/*
181141814Snjl	 * Check that the TSC isn't being used as a timecounter.
182141814Snjl	 * If it is, then return EBUSY and refuse to change the
183141814Snjl	 * clock speed.
184141814Snjl	 */
185141814Snjl	if (strcmp(timecounter->tc_name, "TSC") == 0)
186141814Snjl		return (EBUSY);
187141814Snjl
188141240Snjl	/* If already at this level, just return. */
189141240Snjl	if (CPUFREQ_CMP(sc->curr_level.total_set.freq, level->total_set.freq))
190141240Snjl		return (0);
191141240Snjl
192141814Snjl	/* If the setting is for a different CPU, switch to it. */
193141814Snjl	cpu_id = PCPU_GET(cpuid);
194141814Snjl	pc = cpu_get_pcpu(dev);
195141814Snjl	KASSERT(pc, ("NULL pcpu for dev %p", dev));
196141814Snjl	if (cpu_id != pc->pc_cpuid) {
197141814Snjl		mtx_lock_spin(&sched_lock);
198141814Snjl		sched_bind(curthread, pc->pc_cpuid);
199141814Snjl		mtx_unlock_spin(&sched_lock);
200141814Snjl	}
201141814Snjl
202141240Snjl	/* First, set the absolute frequency via its driver. */
203141240Snjl	set = &level->abs_set;
204141240Snjl	if (set->dev) {
205141240Snjl		if (!device_is_attached(set->dev)) {
206141240Snjl			error = ENXIO;
207141240Snjl			goto out;
208141240Snjl		}
209141240Snjl		error = CPUFREQ_DRV_SET(set->dev, set);
210141240Snjl		if (error) {
211141240Snjl			goto out;
212141240Snjl		}
213141240Snjl	}
214141240Snjl
215141413Snjl	/* Next, set any/all relative frequencies via their drivers. */
216141413Snjl	for (i = 0; i < level->rel_count; i++) {
217141413Snjl		set = &level->rel_set[i];
218141413Snjl		if (!device_is_attached(set->dev)) {
219141413Snjl			error = ENXIO;
220141413Snjl			goto out;
221141413Snjl		}
222141413Snjl		error = CPUFREQ_DRV_SET(set->dev, set);
223141413Snjl		if (error) {
224141413Snjl			/* XXX Back out any successful setting? */
225141413Snjl			goto out;
226141413Snjl		}
227141413Snjl	}
228141240Snjl
229141240Snjl	/* Record the current level. */
230141240Snjl	sc->curr_level = *level;
231141240Snjl	sc->priority = priority;
232141240Snjl	error = 0;
233141240Snjl
234141240Snjlout:
235141814Snjl	/* If we switched to another CPU, switch back before exiting. */
236141814Snjl	if (cpu_id != pc->pc_cpuid) {
237141814Snjl		mtx_lock_spin(&sched_lock);
238141814Snjl		sched_unbind(curthread);
239141814Snjl		mtx_unlock_spin(&sched_lock);
240141814Snjl	}
241141240Snjl	if (error)
242141240Snjl		device_printf(set->dev, "set freq failed, err %d\n", error);
243141240Snjl	return (error);
244141240Snjl}
245141240Snjl
246141240Snjlstatic int
247141240Snjlcf_get_method(device_t dev, struct cf_level *level)
248141240Snjl{
249141240Snjl	struct cpufreq_softc *sc;
250141240Snjl	struct cf_level *levels;
251141240Snjl	struct cf_setting *curr_set, set;
252141240Snjl	struct pcpu *pc;
253141240Snjl	device_t *devs;
254141240Snjl	int count, error, i, numdevs;
255141240Snjl	uint64_t rate;
256141240Snjl
257141240Snjl	sc = device_get_softc(dev);
258141240Snjl	curr_set = &sc->curr_level.total_set;
259141240Snjl	levels = NULL;
260141240Snjl
261141240Snjl	/* If we already know the current frequency, we're done. */
262141240Snjl	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN)
263141240Snjl		goto out;
264141240Snjl
265141240Snjl	/*
266141240Snjl	 * We need to figure out the current level.  Loop through every
267141240Snjl	 * driver, getting the current setting.  Then, attempt to get a best
268141240Snjl	 * match of settings against each level.
269141240Snjl	 */
270141240Snjl	count = CF_MAX_LEVELS;
271141240Snjl	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
272141240Snjl	if (levels == NULL)
273141240Snjl		return (ENOMEM);
274141240Snjl	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
275141240Snjl	if (error)
276141240Snjl		goto out;
277141240Snjl	error = device_get_children(device_get_parent(dev), &devs, &numdevs);
278141240Snjl	if (error)
279141240Snjl		goto out;
280141240Snjl	for (i = 0; i < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; i++) {
281141240Snjl		if (!device_is_attached(devs[i]))
282141240Snjl			continue;
283141240Snjl		error = CPUFREQ_DRV_GET(devs[i], &set);
284141240Snjl		if (error)
285141240Snjl			continue;
286141240Snjl		for (i = 0; i < count; i++) {
287141413Snjl			if (CPUFREQ_CMP(set.freq, levels[i].total_set.freq)) {
288141240Snjl				sc->curr_level = levels[i];
289141240Snjl				break;
290141240Snjl			}
291141240Snjl		}
292141240Snjl	}
293141240Snjl	free(devs, M_TEMP);
294141240Snjl	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN)
295141240Snjl		goto out;
296141240Snjl
297141240Snjl	/*
298141240Snjl	 * We couldn't find an exact match, so attempt to estimate and then
299141240Snjl	 * match against a level.
300141240Snjl	 */
301141240Snjl	pc = cpu_get_pcpu(dev);
302141240Snjl	if (pc == NULL) {
303141240Snjl		error = ENXIO;
304141240Snjl		goto out;
305141240Snjl	}
306141240Snjl	cpu_est_clockrate(pc->pc_cpuid, &rate);
307141240Snjl	rate /= 1000000;
308141240Snjl	for (i = 0; i < count; i++) {
309141240Snjl		if (CPUFREQ_CMP(rate, levels[i].total_set.freq)) {
310141240Snjl			sc->curr_level = levels[i];
311141240Snjl			break;
312141240Snjl		}
313141240Snjl	}
314141240Snjl
315141240Snjlout:
316141240Snjl	if (levels)
317141240Snjl		free(levels, M_TEMP);
318141240Snjl	*level = sc->curr_level;
319141240Snjl	return (0);
320141240Snjl}
321141240Snjl
322141240Snjlstatic int
323141240Snjlcf_levels_method(device_t dev, struct cf_level *levels, int *count)
324141240Snjl{
325141413Snjl	struct cf_setting_array *set_arr;
326141240Snjl	struct cf_setting_lst rel_sets;
327141240Snjl	struct cpufreq_softc *sc;
328141240Snjl	struct cf_level *lev;
329141240Snjl	struct cf_setting *sets;
330141240Snjl	struct pcpu *pc;
331141240Snjl	device_t *devs;
332141413Snjl	int error, i, numdevs, set_count, type;
333141240Snjl	uint64_t rate;
334141240Snjl
335141240Snjl	if (levels == NULL || count == NULL)
336141240Snjl		return (EINVAL);
337141240Snjl
338141240Snjl	TAILQ_INIT(&rel_sets);
339141240Snjl	sc = device_get_softc(dev);
340141240Snjl	error = device_get_children(device_get_parent(dev), &devs, &numdevs);
341141240Snjl	if (error)
342141240Snjl		return (error);
343141240Snjl	sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
344141240Snjl	if (sets == NULL) {
345141240Snjl		free(devs, M_TEMP);
346141240Snjl		return (ENOMEM);
347141240Snjl	}
348141240Snjl
349141240Snjl	/* Get settings from all cpufreq drivers. */
350141240Snjl	for (i = 0; i < numdevs; i++) {
351141824Snjl		/* Skip devices that aren't ready. */
352141240Snjl		if (!device_is_attached(devs[i]))
353141240Snjl			continue;
354141824Snjl
355141824Snjl		/*
356141824Snjl		 * Get settings, skipping drivers that offer no settings or
357141824Snjl		 * provide settings for informational purposes only.
358141824Snjl		 */
359141240Snjl		set_count = MAX_SETTINGS;
360141240Snjl		error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count, &type);
361141824Snjl		if (error || set_count == 0 || (type & CPUFREQ_FLAG_INFO_ONLY))
362141240Snjl			continue;
363141413Snjl
364141824Snjl		/* Add the settings to our absolute/relative lists. */
365141814Snjl		switch (type & CPUFREQ_TYPE_MASK) {
366141413Snjl		case CPUFREQ_TYPE_ABSOLUTE:
367141413Snjl			error = cpufreq_insert_abs(sc, sets, set_count);
368141413Snjl			break;
369141413Snjl		case CPUFREQ_TYPE_RELATIVE:
370141413Snjl			set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
371141413Snjl			if (set_arr == NULL) {
372141413Snjl				error = ENOMEM;
373141413Snjl				goto out;
374141413Snjl			}
375141413Snjl			bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
376141413Snjl			set_arr->count = set_count;
377141413Snjl			TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
378141413Snjl			break;
379141413Snjl		default:
380141413Snjl			error = EINVAL;
381141413Snjl			break;
382141413Snjl		}
383141240Snjl		if (error)
384141240Snjl			goto out;
385141240Snjl	}
386141240Snjl
387141240Snjl	/* If there are no absolute levels, create a fake one at 100%. */
388141240Snjl	if (TAILQ_EMPTY(&sc->all_levels)) {
389141240Snjl		bzero(&sets[0], sizeof(*sets));
390141240Snjl		pc = cpu_get_pcpu(dev);
391141240Snjl		if (pc == NULL) {
392141240Snjl			error = ENXIO;
393141240Snjl			goto out;
394141240Snjl		}
395141240Snjl		cpu_est_clockrate(pc->pc_cpuid, &rate);
396141240Snjl		sets[0].freq = rate / 1000000;
397141413Snjl		error = cpufreq_insert_abs(sc, sets, 1);
398141240Snjl		if (error)
399141240Snjl			goto out;
400141240Snjl	}
401141240Snjl
402141413Snjl	/* Create a combined list of absolute + relative levels. */
403141413Snjl	TAILQ_FOREACH(set_arr, &rel_sets, link)
404141413Snjl		cpufreq_expand_set(sc, set_arr);
405141413Snjl
406141413Snjl	/* If the caller doesn't have enough space, return the actual count. */
407141413Snjl	if (sc->all_count > *count) {
408141413Snjl		*count = sc->all_count;
409141413Snjl		error = E2BIG;
410141413Snjl		goto out;
411141413Snjl	}
412141413Snjl
413141413Snjl	/* Finally, output the list of levels. */
414141240Snjl	i = 0;
415141240Snjl	TAILQ_FOREACH(lev, &sc->all_levels, link) {
416141240Snjl		levels[i] = *lev;
417141240Snjl		i++;
418141240Snjl	}
419141413Snjl	*count = sc->all_count;
420141240Snjl	error = 0;
421141240Snjl
422141240Snjlout:
423141240Snjl	/* Clear all levels since we regenerate them each time. */
424141240Snjl	while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
425141240Snjl		TAILQ_REMOVE(&sc->all_levels, lev, link);
426141240Snjl		free(lev, M_TEMP);
427141240Snjl	}
428141413Snjl	while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
429141413Snjl		TAILQ_REMOVE(&rel_sets, set_arr, link);
430141413Snjl		free(set_arr, M_TEMP);
431141413Snjl	}
432141413Snjl	sc->all_count = 0;
433141240Snjl	free(devs, M_TEMP);
434141240Snjl	free(sets, M_TEMP);
435141240Snjl	return (error);
436141240Snjl}
437141240Snjl
438141240Snjl/*
439141240Snjl * Create levels for an array of absolute settings and insert them in
440141240Snjl * sorted order in the specified list.
441141240Snjl */
442141240Snjlstatic int
443141413Snjlcpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
444141240Snjl    int count)
445141240Snjl{
446141413Snjl	struct cf_level_lst *list;
447141240Snjl	struct cf_level *level, *search;
448141240Snjl	int i;
449141240Snjl
450141413Snjl	list = &sc->all_levels;
451141240Snjl	for (i = 0; i < count; i++) {
452141240Snjl		level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
453141240Snjl		if (level == NULL)
454141240Snjl			return (ENOMEM);
455141240Snjl		level->abs_set = sets[i];
456141413Snjl		level->total_set = sets[i];
457141413Snjl		level->total_set.dev = NULL;
458141413Snjl		sc->all_count++;
459141240Snjl
460141240Snjl		if (TAILQ_EMPTY(list)) {
461141240Snjl			TAILQ_INSERT_HEAD(list, level, link);
462141240Snjl			continue;
463141240Snjl		}
464141240Snjl
465141240Snjl		TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
466141413Snjl			if (sets[i].freq <= search->total_set.freq) {
467141240Snjl				TAILQ_INSERT_AFTER(list, search, level, link);
468141240Snjl				break;
469141240Snjl			}
470141240Snjl		}
471141240Snjl	}
472141240Snjl	return (0);
473141240Snjl}
474141240Snjl
475141413Snjl/*
476141413Snjl * Expand a group of relative settings, creating derived levels from them.
477141413Snjl */
478141240Snjlstatic int
479141413Snjlcpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
480141413Snjl{
481141413Snjl	struct cf_level *fill, *search;
482141413Snjl	struct cf_setting *set;
483141413Snjl	int i;
484141413Snjl
485141413Snjl	TAILQ_FOREACH(search, &sc->all_levels, link) {
486141413Snjl		/* Skip this level if we've already modified it. */
487141413Snjl		for (i = 0; i < search->rel_count; i++) {
488141413Snjl			if (search->rel_set[i].dev == set_arr->sets[0].dev)
489141413Snjl				break;
490141413Snjl		}
491141413Snjl		if (i != search->rel_count)
492141413Snjl			continue;
493141413Snjl
494141413Snjl		/* Add each setting to the level, duplicating if necessary. */
495141413Snjl		for (i = 0; i < set_arr->count; i++) {
496141413Snjl			set = &set_arr->sets[i];
497141413Snjl
498141413Snjl			/*
499141413Snjl			 * If this setting is less than 100%, split the level
500141413Snjl			 * into two and add this setting to the new level.
501141413Snjl			 */
502141413Snjl			fill = search;
503141413Snjl			if (set->freq < 10000)
504141413Snjl				fill = cpufreq_dup_set(sc, search, set);
505141413Snjl
506141413Snjl			/*
507141413Snjl			 * The new level was a duplicate of an existing level
508141413Snjl			 * so we freed it.  Go to the next setting.
509141413Snjl			 */
510141413Snjl			if (fill == NULL)
511141413Snjl				continue;
512141413Snjl
513141413Snjl			/* Add this setting to the existing or new level. */
514141413Snjl			KASSERT(fill->rel_count < MAX_SETTINGS,
515141413Snjl			    ("cpufreq: too many relative drivers (%d)",
516141413Snjl			    MAX_SETTINGS));
517141413Snjl			fill->rel_set[fill->rel_count] = *set;
518141413Snjl			fill->rel_count++;
519141413Snjl		}
520141413Snjl	}
521141413Snjl
522141413Snjl	return (0);
523141413Snjl}
524141413Snjl
525141413Snjlstatic struct cf_level *
526141413Snjlcpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
527141413Snjl    struct cf_setting *set)
528141413Snjl{
529141413Snjl	struct cf_level_lst *list;
530141413Snjl	struct cf_level *fill, *itr;
531141413Snjl	struct cf_setting *fill_set, *itr_set;
532141413Snjl	int i;
533141413Snjl
534141413Snjl	/*
535141413Snjl	 * Create a new level, copy it from the old one, and update the
536141413Snjl	 * total frequency and power by the percentage specified in the
537141413Snjl	 * relative setting.
538141413Snjl	 */
539141413Snjl	fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
540141413Snjl	if (fill == NULL)
541141413Snjl		return (NULL);
542141413Snjl	*fill = *dup;
543141413Snjl	fill_set = &fill->total_set;
544141413Snjl	fill_set->freq =
545141413Snjl	    ((uint64_t)fill_set->freq * set->freq) / 10000;
546141413Snjl	if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
547141413Snjl		fill_set->power = ((uint64_t)fill_set->power * set->freq)
548141413Snjl		    / 10000;
549141413Snjl	}
550141413Snjl	if (set->lat != CPUFREQ_VAL_UNKNOWN) {
551141413Snjl		if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
552141413Snjl			fill_set->lat += set->lat;
553141413Snjl		else
554141413Snjl			fill_set->lat = set->lat;
555141413Snjl	}
556141413Snjl
557141413Snjl	/*
558141413Snjl	 * If we copied an old level that we already modified (say, at 100%),
559141413Snjl	 * we need to remove that setting before adding this one.  Since we
560141413Snjl	 * process each setting array in order, we know any settings for this
561141413Snjl	 * driver will be found at the end.
562141413Snjl	 */
563141413Snjl	for (i = fill->rel_count; i != 0; i--) {
564141413Snjl		if (fill->rel_set[i - 1].dev != set->dev)
565141413Snjl			break;
566141413Snjl		fill->rel_count--;
567141413Snjl	}
568141413Snjl
569141413Snjl	/*
570141413Snjl	 * Insert the new level in sorted order.  If we find a duplicate,
571141413Snjl	 * free the new level.  We can do this since any existing level will
572141413Snjl	 * be guaranteed to have the same or less settings and thus consume
573141413Snjl	 * less power.  For example, a level with one absolute setting of
574141413Snjl	 * 800 Mhz uses less power than one composed of an absolute setting
575141413Snjl	 * of 1600 Mhz and a relative setting at 50%.
576141413Snjl	 */
577141413Snjl	list = &sc->all_levels;
578141413Snjl	if (TAILQ_EMPTY(list)) {
579141413Snjl		TAILQ_INSERT_HEAD(list, fill, link);
580141413Snjl	} else {
581141413Snjl		TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
582141413Snjl			itr_set = &itr->total_set;
583141413Snjl			if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
584141413Snjl				free(fill, M_TEMP);
585141413Snjl				fill = NULL;
586141413Snjl				break;
587141413Snjl			} else if (fill_set->freq < itr_set->freq) {
588141413Snjl				TAILQ_INSERT_AFTER(list, itr, fill, link);
589141413Snjl				sc->all_count++;
590141413Snjl				break;
591141413Snjl			}
592141413Snjl		}
593141413Snjl	}
594141413Snjl
595141413Snjl	return (fill);
596141413Snjl}
597141413Snjl
598141413Snjlstatic int
599141240Snjlcpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
600141240Snjl{
601141240Snjl	struct cpufreq_softc *sc;
602141240Snjl	struct cf_level *levels;
603141814Snjl	int count, devcount, error, freq, i, n;
604141814Snjl	device_t *devs;
605141240Snjl
606141814Snjl	devs = NULL;
607141240Snjl	sc = oidp->oid_arg1;
608141814Snjl	levels = malloc(CF_MAX_LEVELS * sizeof(*levels), M_TEMP, M_NOWAIT);
609141240Snjl	if (levels == NULL)
610141240Snjl		return (ENOMEM);
611141240Snjl
612141240Snjl	error = CPUFREQ_GET(sc->dev, &levels[0]);
613141240Snjl	if (error)
614141240Snjl		goto out;
615141240Snjl	freq = levels[0].total_set.freq;
616141240Snjl	error = sysctl_handle_int(oidp, &freq, 0, req);
617141240Snjl	if (error != 0 || req->newptr == NULL)
618141240Snjl		goto out;
619141240Snjl
620141814Snjl	/*
621141814Snjl	 * While we only call cpufreq_get() on one device (assuming all
622141814Snjl	 * CPUs have equal levels), we call cpufreq_set() on all CPUs.
623141814Snjl	 * This is needed for some MP systems.
624141814Snjl	 */
625141814Snjl	error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
626141240Snjl	if (error)
627141240Snjl		goto out;
628141814Snjl	for (n = 0; n < devcount; n++) {
629141814Snjl		count = CF_MAX_LEVELS;
630141814Snjl		error = CPUFREQ_LEVELS(devs[n], levels, &count);
631141814Snjl		if (error)
632141240Snjl			break;
633141814Snjl		for (i = 0; i < count; i++) {
634141814Snjl			if (CPUFREQ_CMP(levels[i].total_set.freq, freq)) {
635141814Snjl				error = CPUFREQ_SET(devs[n], &levels[i],
636141814Snjl				    CPUFREQ_PRIO_USER);
637141814Snjl				break;
638141814Snjl			}
639141240Snjl		}
640141814Snjl		if (i == count) {
641141814Snjl			error = EINVAL;
642141814Snjl			break;
643141814Snjl		}
644141240Snjl	}
645141240Snjl
646141240Snjlout:
647141814Snjl	if (devs)
648141814Snjl		free(devs, M_TEMP);
649141240Snjl	if (levels)
650141240Snjl		free(levels, M_TEMP);
651141240Snjl	return (error);
652141240Snjl}
653141240Snjl
654141240Snjlstatic int
655141240Snjlcpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
656141240Snjl{
657141240Snjl	struct cpufreq_softc *sc;
658141240Snjl	struct cf_level *levels;
659141240Snjl	struct cf_setting *set;
660141240Snjl	struct sbuf sb;
661141240Snjl	int count, error, i;
662141240Snjl
663141240Snjl	sc = oidp->oid_arg1;
664141240Snjl	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
665141240Snjl
666141240Snjl	/* Get settings from the device and generate the output string. */
667141240Snjl	count = CF_MAX_LEVELS;
668141240Snjl	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
669141240Snjl	if (levels == NULL)
670141240Snjl		return (ENOMEM);
671141240Snjl	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
672141240Snjl	if (error)
673141240Snjl		goto out;
674141240Snjl	if (count) {
675141240Snjl		for (i = 0; i < count; i++) {
676141240Snjl			set = &levels[i].total_set;
677141240Snjl			sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
678141240Snjl		}
679141240Snjl	} else
680141240Snjl		sbuf_cpy(&sb, "0");
681141240Snjl	sbuf_trim(&sb);
682141240Snjl	sbuf_finish(&sb);
683141240Snjl	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
684141240Snjl
685141240Snjlout:
686141240Snjl	free(levels, M_TEMP);
687141240Snjl	sbuf_delete(&sb);
688141240Snjl	return (error);
689141240Snjl}
690141240Snjl
691141240Snjlint
692141240Snjlcpufreq_register(device_t dev)
693141240Snjl{
694141240Snjl	device_t cf_dev, cpu_dev;
695141240Snjl
696141240Snjl	/*
697141814Snjl	 * Add only one cpufreq device to each CPU.  Currently, all CPUs
698141814Snjl	 * must offer the same levels and be switched at the same time.
699141240Snjl	 */
700141814Snjl	cpu_dev = device_get_parent(dev);
701141814Snjl	KASSERT(cpu_dev != NULL, ("no parent for %p", dev));
702141814Snjl	if (device_find_child(cpu_dev, "cpufreq", -1))
703141240Snjl		return (0);
704141240Snjl
705141814Snjl	/* Add the child device and possibly sysctls. */
706141814Snjl	cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
707141240Snjl	if (cf_dev == NULL)
708141240Snjl		return (ENOMEM);
709141240Snjl	device_quiet(cf_dev);
710141240Snjl
711141240Snjl	return (device_probe_and_attach(cf_dev));
712141240Snjl}
713141240Snjl
714141240Snjlint
715141240Snjlcpufreq_unregister(device_t dev)
716141240Snjl{
717141240Snjl	device_t cf_dev, *devs;
718141240Snjl	int cfcount, count, devcount, error, i, type;
719141240Snjl	struct cf_setting set;
720141240Snjl
721141240Snjl	/*
722141240Snjl	 * If this is the last cpufreq child device, remove the control
723141240Snjl	 * device as well.  We identify cpufreq children by calling a method
724141240Snjl	 * they support.
725141240Snjl	 */
726141240Snjl	error = device_get_children(device_get_parent(dev), &devs, &devcount);
727141240Snjl	if (error)
728141240Snjl		return (error);
729141240Snjl	cf_dev = devclass_get_device(cpufreq_dc, 0);
730141240Snjl	KASSERT(cf_dev != NULL, ("unregister with no cpufreq dev"));
731141240Snjl	cfcount = 0;
732141240Snjl	for (i = 0; i < devcount; i++) {
733141240Snjl		if (!device_is_attached(devs[i]))
734141240Snjl			continue;
735141240Snjl		count = 1;
736141240Snjl		if (CPUFREQ_DRV_SETTINGS(devs[i], &set, &count, &type) == 0)
737141240Snjl			cfcount++;
738141240Snjl	}
739141814Snjl	if (cfcount <= 1)
740141240Snjl		device_delete_child(device_get_parent(cf_dev), cf_dev);
741141240Snjl	free(devs, M_TEMP);
742141240Snjl
743141240Snjl	return (0);
744141240Snjl}
745