1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2004-2007 Nate Lawson (SDG)
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/bus.h>
31#include <sys/cpu.h>
32#include <sys/eventhandler.h>
33#include <sys/kernel.h>
34#include <sys/lock.h>
35#include <sys/malloc.h>
36#include <sys/module.h>
37#include <sys/proc.h>
38#include <sys/queue.h>
39#include <sys/sbuf.h>
40#include <sys/sched.h>
41#include <sys/smp.h>
42#include <sys/sysctl.h>
43#include <sys/systm.h>
44#include <sys/sx.h>
45#include <sys/timetc.h>
46#include <sys/taskqueue.h>
47
48#include "cpufreq_if.h"
49
50/*
51 * Common CPU frequency glue code.  Drivers for specific hardware can
52 * attach this interface to allow users to get/set the CPU frequency.
53 */
54
/*
 * Upper bound on the number of levels handled in one request; the level
 * buffers allocated in this file are sized with it.  Levels are synthesized
 * from driver settings, so M settings offered by N drivers can produce up
 * to M*N levels.
 */
#define CF_MAX_LEVELS	256
60
61struct cf_saved_freq {
62	struct cf_level			level;
63	int				priority;
64	SLIST_ENTRY(cf_saved_freq)	link;
65};
66
67struct cpufreq_softc {
68	struct sx			lock;
69	struct cf_level			curr_level;
70	int				curr_priority;
71	SLIST_HEAD(, cf_saved_freq)	saved_freq;
72	struct cf_level_lst		all_levels;
73	int				all_count;
74	int				max_mhz;
75	device_t			dev;
76	device_t			cf_drv_dev;
77	struct sysctl_ctx_list		sysctl_ctx;
78	struct task			startup_task;
79	struct cf_level			*levels_buf;
80};
81
82struct cf_setting_array {
83	struct cf_setting		sets[MAX_SETTINGS];
84	int				count;
85	TAILQ_ENTRY(cf_setting_array)	link;
86};
87
88TAILQ_HEAD(cf_setting_lst, cf_setting_array);
89
/*
 * Locking wrappers.  Despite the MTX name they are implemented with an
 * sx(9) lock that is always taken exclusively.
 */
#define CF_MTX_INIT(x)		sx_init((x), "cpufreq lock")
#define CF_MTX_LOCK(x)		sx_xlock((x))
#define CF_MTX_UNLOCK(x)	sx_xunlock((x))
#define CF_MTX_ASSERT(x)	sx_assert((x), SX_XLOCKED)

/*
 * printf() with a "cpufreq: " prefix that only produces output while the
 * cf_verbose knob (debug.cpufreq.verbose) is non-zero.  The first argument
 * must be a string literal because it is pasted onto the prefix.
 */
#define CF_DEBUG(msg...)	do {		\
	if (cf_verbose)				\
		printf("cpufreq: " msg);	\
	} while (0)
99
100static int	cpufreq_attach(device_t dev);
101static void	cpufreq_startup_task(void *ctx, int pending);
102static int	cpufreq_detach(device_t dev);
103static int	cf_set_method(device_t dev, const struct cf_level *level,
104		    int priority);
105static int	cf_get_method(device_t dev, struct cf_level *level);
106static int	cf_levels_method(device_t dev, struct cf_level *levels,
107		    int *count);
108static int	cpufreq_insert_abs(struct cpufreq_softc *sc,
109		    struct cf_setting *sets, int count);
110static int	cpufreq_expand_set(struct cpufreq_softc *sc,
111		    struct cf_setting_array *set_arr);
112static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
113		    struct cf_level *dup, struct cf_setting *set);
114static int	cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
115static int	cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
116static int	cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);
117
118static device_method_t cpufreq_methods[] = {
119	DEVMETHOD(device_probe,		bus_generic_probe),
120	DEVMETHOD(device_attach,	cpufreq_attach),
121	DEVMETHOD(device_detach,	cpufreq_detach),
122
123        DEVMETHOD(cpufreq_set,		cf_set_method),
124        DEVMETHOD(cpufreq_get,		cf_get_method),
125        DEVMETHOD(cpufreq_levels,	cf_levels_method),
126	{0, 0}
127};
128
129static driver_t cpufreq_driver = {
130	"cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
131};
132
133DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, 0, 0);
134
135static int		cf_lowest_freq;
136static int		cf_verbose;
137static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
138    "cpufreq debugging");
139SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1,
140    "Don't provide levels below this frequency.");
141SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1,
142    "Print verbose debugging messages");
143
144/*
145 * This is called as the result of a hardware specific frequency control driver
146 * calling cpufreq_register. It provides a general interface for system wide
147 * frequency controls and operates on a per cpu basis.
148 */
149static int
150cpufreq_attach(device_t dev)
151{
152	struct cpufreq_softc *sc;
153	struct pcpu *pc;
154	device_t parent;
155	uint64_t rate;
156
157	CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
158	sc = device_get_softc(dev);
159	parent = device_get_parent(dev);
160	sc->dev = dev;
161	sysctl_ctx_init(&sc->sysctl_ctx);
162	TAILQ_INIT(&sc->all_levels);
163	CF_MTX_INIT(&sc->lock);
164	sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
165	SLIST_INIT(&sc->saved_freq);
166	/* Try to get nominal CPU freq to use it as maximum later if needed */
167	sc->max_mhz = cpu_get_nominal_mhz(dev);
168	/* If that fails, try to measure the current rate */
169	if (sc->max_mhz <= 0) {
170		CF_DEBUG("Unable to obtain nominal frequency.\n");
171		pc = cpu_get_pcpu(dev);
172		if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
173			sc->max_mhz = rate / 1000000;
174		else
175			sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
176	}
177
178	CF_DEBUG("initializing one-time data for %s\n",
179	    device_get_nameunit(dev));
180	sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
181	    M_DEVBUF, M_WAITOK);
182	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
183	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
184	    OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
185	    sc, 0, cpufreq_curr_sysctl, "I", "Current CPU frequency");
186	SYSCTL_ADD_PROC(&sc->sysctl_ctx,
187	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
188	    OID_AUTO, "freq_levels",
189	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, sc, 0,
190	    cpufreq_levels_sysctl, "A", "CPU frequency levels");
191
192	/*
193	 * Queue a one-shot broadcast that levels have changed.
194	 * It will run once the system has completed booting.
195	 */
196	TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
197	taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
198
199	return (0);
200}
201
202/* Handle any work to be done for all drivers that attached during boot. */
203static void
204cpufreq_startup_task(void *ctx, int pending)
205{
206
207	cpufreq_settings_changed((device_t)ctx);
208}
209
210static int
211cpufreq_detach(device_t dev)
212{
213	struct cpufreq_softc *sc;
214	struct cf_saved_freq *saved_freq;
215
216	CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
217	sc = device_get_softc(dev);
218	sysctl_ctx_free(&sc->sysctl_ctx);
219
220	while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) {
221		SLIST_REMOVE_HEAD(&sc->saved_freq, link);
222		free(saved_freq, M_TEMP);
223	}
224
225	free(sc->levels_buf, M_DEVBUF);
226
227	return (0);
228}
229
230static int
231cf_set_method(device_t dev, const struct cf_level *level, int priority)
232{
233	struct cpufreq_softc *sc;
234	const struct cf_setting *set;
235	struct cf_saved_freq *saved_freq, *curr_freq;
236	struct pcpu *pc;
237	int error, i;
238	u_char pri;
239
240	sc = device_get_softc(dev);
241	error = 0;
242	set = NULL;
243	saved_freq = NULL;
244
245	/* We are going to change levels so notify the pre-change handler. */
246	EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
247	if (error != 0) {
248		EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
249		return (error);
250	}
251
252	CF_MTX_LOCK(&sc->lock);
253
254#ifdef SMP
255#ifdef EARLY_AP_STARTUP
256	MPASS(mp_ncpus == 1 || smp_started);
257#else
258	/*
259	 * If still booting and secondary CPUs not started yet, don't allow
260	 * changing the frequency until they're online.  This is because we
261	 * can't switch to them using sched_bind() and thus we'd only be
262	 * switching the main CPU.  XXXTODO: Need to think more about how to
263	 * handle having different CPUs at different frequencies.
264	 */
265	if (mp_ncpus > 1 && !smp_started) {
266		device_printf(dev, "rejecting change, SMP not started yet\n");
267		error = ENXIO;
268		goto out;
269	}
270#endif
271#endif /* SMP */
272
273	/*
274	 * If the requested level has a lower priority, don't allow
275	 * the new level right now.
276	 */
277	if (priority < sc->curr_priority) {
278		CF_DEBUG("ignoring, curr prio %d less than %d\n", priority,
279		    sc->curr_priority);
280		error = EPERM;
281		goto out;
282	}
283
284	/*
285	 * If the caller didn't specify a level and one is saved, prepare to
286	 * restore the saved level.  If none has been saved, return an error.
287	 */
288	if (level == NULL) {
289		saved_freq = SLIST_FIRST(&sc->saved_freq);
290		if (saved_freq == NULL) {
291			CF_DEBUG("NULL level, no saved level\n");
292			error = ENXIO;
293			goto out;
294		}
295		level = &saved_freq->level;
296		priority = saved_freq->priority;
297		CF_DEBUG("restoring saved level, freq %d prio %d\n",
298		    level->total_set.freq, priority);
299	}
300
301	/* Reject levels that are below our specified threshold. */
302	if (level->total_set.freq < cf_lowest_freq) {
303		CF_DEBUG("rejecting freq %d, less than %d limit\n",
304		    level->total_set.freq, cf_lowest_freq);
305		error = EINVAL;
306		goto out;
307	}
308
309	/* If already at this level, just return. */
310	if (sc->curr_level.total_set.freq == level->total_set.freq) {
311		CF_DEBUG("skipping freq %d, same as current level %d\n",
312		    level->total_set.freq, sc->curr_level.total_set.freq);
313		goto skip;
314	}
315
316	/* First, set the absolute frequency via its driver. */
317	set = &level->abs_set;
318	if (set->dev) {
319		if (!device_is_attached(set->dev)) {
320			error = ENXIO;
321			goto out;
322		}
323
324		/* Bind to the target CPU before switching. */
325		pc = cpu_get_pcpu(set->dev);
326
327		/* Skip settings if CPU is not started. */
328		if (pc == NULL) {
329			error = 0;
330			goto out;
331		}
332		thread_lock(curthread);
333		pri = curthread->td_priority;
334		sched_prio(curthread, PRI_MIN);
335		sched_bind(curthread, pc->pc_cpuid);
336		thread_unlock(curthread);
337		CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
338		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
339		error = CPUFREQ_DRV_SET(set->dev, set);
340		thread_lock(curthread);
341		sched_unbind(curthread);
342		sched_prio(curthread, pri);
343		thread_unlock(curthread);
344		if (error) {
345			goto out;
346		}
347	}
348
349	/* Next, set any/all relative frequencies via their drivers. */
350	for (i = 0; i < level->rel_count; i++) {
351		set = &level->rel_set[i];
352		if (!device_is_attached(set->dev)) {
353			error = ENXIO;
354			goto out;
355		}
356
357		/* Bind to the target CPU before switching. */
358		pc = cpu_get_pcpu(set->dev);
359		thread_lock(curthread);
360		pri = curthread->td_priority;
361		sched_prio(curthread, PRI_MIN);
362		sched_bind(curthread, pc->pc_cpuid);
363		thread_unlock(curthread);
364		CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
365		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
366		error = CPUFREQ_DRV_SET(set->dev, set);
367		thread_lock(curthread);
368		sched_unbind(curthread);
369		sched_prio(curthread, pri);
370		thread_unlock(curthread);
371		if (error) {
372			/* XXX Back out any successful setting? */
373			goto out;
374		}
375	}
376
377skip:
378	/*
379	 * Before recording the current level, check if we're going to a
380	 * higher priority.  If so, save the previous level and priority.
381	 */
382	if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
383	    priority > sc->curr_priority) {
384		CF_DEBUG("saving level, freq %d prio %d\n",
385		    sc->curr_level.total_set.freq, sc->curr_priority);
386		curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT);
387		if (curr_freq == NULL) {
388			error = ENOMEM;
389			goto out;
390		}
391		curr_freq->level = sc->curr_level;
392		curr_freq->priority = sc->curr_priority;
393		SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link);
394	}
395	sc->curr_level = *level;
396	sc->curr_priority = priority;
397
398	/* If we were restoring a saved state, reset it to "unused". */
399	if (saved_freq != NULL) {
400		CF_DEBUG("resetting saved level\n");
401		sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
402		SLIST_REMOVE_HEAD(&sc->saved_freq, link);
403		free(saved_freq, M_TEMP);
404	}
405
406out:
407	CF_MTX_UNLOCK(&sc->lock);
408
409	/*
410	 * We changed levels (or attempted to) so notify the post-change
411	 * handler of new frequency or error.
412	 */
413	EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
414	if (error && set)
415		device_printf(set->dev, "set freq failed, err %d\n", error);
416
417	return (error);
418}
419
420static int
421cpufreq_get_frequency(device_t dev)
422{
423	struct cf_setting set;
424
425	if (CPUFREQ_DRV_GET(dev, &set) != 0)
426		return (-1);
427
428	return (set.freq);
429}
430
431/* Returns the index into *levels with the match */
432static int
433cpufreq_get_level(device_t dev, struct cf_level *levels, int count)
434{
435	int i, freq;
436
437	if ((freq = cpufreq_get_frequency(dev)) < 0)
438		return (-1);
439	for (i = 0; i < count; i++)
440		if (freq == levels[i].total_set.freq)
441			return (i);
442
443	return (-1);
444}
445
446/*
447 * Used by the cpufreq core, this function will populate *level with the current
448 * frequency as either determined by a cached value sc->curr_level, or in the
449 * case the lower level driver has set the CPUFREQ_FLAG_UNCACHED flag, it will
450 * obtain the frequency from the driver itself.
451 */
452static int
453cf_get_method(device_t dev, struct cf_level *level)
454{
455	struct cpufreq_softc *sc;
456	struct cf_level *levels;
457	struct cf_setting *curr_set;
458	struct pcpu *pc;
459	int bdiff, count, diff, error, i, type;
460	uint64_t rate;
461
462	sc = device_get_softc(dev);
463	error = 0;
464	levels = NULL;
465
466	/*
467	 * If we already know the current frequency, and the driver didn't ask
468	 * for uncached usage, we're done.
469	 */
470	CF_MTX_LOCK(&sc->lock);
471	curr_set = &sc->curr_level.total_set;
472	error = CPUFREQ_DRV_TYPE(sc->cf_drv_dev, &type);
473	if (error == 0 && (type & CPUFREQ_FLAG_UNCACHED)) {
474		struct cf_setting set;
475
476		/*
477		 * If the driver wants to always report back the real frequency,
478		 * first try the driver and if that fails, fall back to
479		 * estimating.
480		 */
481		if (CPUFREQ_DRV_GET(sc->cf_drv_dev, &set) == 0) {
482			sc->curr_level.total_set = set;
483			CF_DEBUG("get returning immediate freq %d\n",
484			    curr_set->freq);
485			goto out;
486		}
487	} else if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
488		CF_DEBUG("get returning known freq %d\n", curr_set->freq);
489		error = 0;
490		goto out;
491	}
492	CF_MTX_UNLOCK(&sc->lock);
493
494	/*
495	 * We need to figure out the current level.  Loop through every
496	 * driver, getting the current setting.  Then, attempt to get a best
497	 * match of settings against each level.
498	 */
499	count = CF_MAX_LEVELS;
500	levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
501	if (levels == NULL)
502		return (ENOMEM);
503	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
504	if (error) {
505		if (error == E2BIG)
506			printf("cpufreq: need to increase CF_MAX_LEVELS\n");
507		free(levels, M_TEMP);
508		return (error);
509	}
510
511	/*
512	 * Reacquire the lock and search for the given level.
513	 *
514	 * XXX Note: this is not quite right since we really need to go
515	 * through each level and compare both absolute and relative
516	 * settings for each driver in the system before making a match.
517	 * The estimation code below catches this case though.
518	 */
519	CF_MTX_LOCK(&sc->lock);
520	i = cpufreq_get_level(sc->cf_drv_dev, levels, count);
521	if (i >= 0)
522		sc->curr_level = levels[i];
523	else
524		CF_DEBUG("Couldn't find supported level for %s\n",
525		    device_get_nameunit(sc->cf_drv_dev));
526
527	if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
528		CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
529		goto out;
530	}
531
532	/*
533	 * We couldn't find an exact match, so attempt to estimate and then
534	 * match against a level.
535	 */
536	pc = cpu_get_pcpu(dev);
537	if (pc == NULL) {
538		error = ENXIO;
539		goto out;
540	}
541	cpu_est_clockrate(pc->pc_cpuid, &rate);
542	rate /= 1000000;
543	bdiff = 1 << 30;
544	for (i = 0; i < count; i++) {
545		diff = abs(levels[i].total_set.freq - rate);
546		if (diff < bdiff) {
547			bdiff = diff;
548			sc->curr_level = levels[i];
549		}
550	}
551	CF_DEBUG("get estimated freq %d\n", curr_set->freq);
552
553out:
554	if (error == 0)
555		*level = sc->curr_level;
556
557	CF_MTX_UNLOCK(&sc->lock);
558	if (levels)
559		free(levels, M_TEMP);
560	return (error);
561}
562
563/*
564 * Either directly obtain settings from the cpufreq driver, or build a list of
565 * relative settings to be integrated later against an absolute max.
566 */
567static int
568cpufreq_add_levels(device_t cf_dev, struct cf_setting_lst *rel_sets)
569{
570	struct cf_setting_array *set_arr;
571	struct cf_setting *sets;
572	device_t dev;
573	struct cpufreq_softc *sc;
574	int type, set_count, error;
575
576	sc = device_get_softc(cf_dev);
577	dev = sc->cf_drv_dev;
578
579	/* Skip devices that aren't ready. */
580	if (!device_is_attached(cf_dev))
581		return (0);
582
583	/*
584	 * Get settings, skipping drivers that offer no settings or
585	 * provide settings for informational purposes only.
586	 */
587	error = CPUFREQ_DRV_TYPE(dev, &type);
588	if (error != 0 || (type & CPUFREQ_FLAG_INFO_ONLY)) {
589		if (error == 0) {
590			CF_DEBUG("skipping info-only driver %s\n",
591			    device_get_nameunit(cf_dev));
592		}
593		return (error);
594	}
595
596	sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
597	if (sets == NULL)
598		return (ENOMEM);
599
600	set_count = MAX_SETTINGS;
601	error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
602	if (error != 0 || set_count == 0)
603		goto out;
604
605	/* Add the settings to our absolute/relative lists. */
606	switch (type & CPUFREQ_TYPE_MASK) {
607	case CPUFREQ_TYPE_ABSOLUTE:
608		error = cpufreq_insert_abs(sc, sets, set_count);
609		break;
610	case CPUFREQ_TYPE_RELATIVE:
611		CF_DEBUG("adding %d relative settings\n", set_count);
612		set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
613		if (set_arr == NULL) {
614			error = ENOMEM;
615			goto out;
616		}
617		bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
618		set_arr->count = set_count;
619		TAILQ_INSERT_TAIL(rel_sets, set_arr, link);
620		break;
621	default:
622		error = EINVAL;
623	}
624
625out:
626	free(sets, M_TEMP);
627	return (error);
628}
629
630static int
631cf_levels_method(device_t dev, struct cf_level *levels, int *count)
632{
633	struct cf_setting_array *set_arr;
634	struct cf_setting_lst rel_sets;
635	struct cpufreq_softc *sc;
636	struct cf_level *lev;
637	struct pcpu *pc;
638	int error, i;
639	uint64_t rate;
640
641	if (levels == NULL || count == NULL)
642		return (EINVAL);
643
644	TAILQ_INIT(&rel_sets);
645	sc = device_get_softc(dev);
646
647	CF_MTX_LOCK(&sc->lock);
648	error = cpufreq_add_levels(sc->dev, &rel_sets);
649	if (error)
650		goto out;
651
652	/*
653	 * If there are no absolute levels, create a fake one at 100%.  We
654	 * then cache the clockrate for later use as our base frequency.
655	 */
656	if (TAILQ_EMPTY(&sc->all_levels)) {
657		struct cf_setting set;
658
659		CF_DEBUG("No absolute levels returned by driver\n");
660
661		if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
662			sc->max_mhz = cpu_get_nominal_mhz(dev);
663			/*
664			 * If the CPU can't report a rate for 100%, hope
665			 * the CPU is running at its nominal rate right now,
666			 * and use that instead.
667			 */
668			if (sc->max_mhz <= 0) {
669				pc = cpu_get_pcpu(dev);
670				cpu_est_clockrate(pc->pc_cpuid, &rate);
671				sc->max_mhz = rate / 1000000;
672			}
673		}
674		memset(&set, CPUFREQ_VAL_UNKNOWN, sizeof(set));
675		set.freq = sc->max_mhz;
676		set.dev = NULL;
677		error = cpufreq_insert_abs(sc, &set, 1);
678		if (error)
679			goto out;
680	}
681
682	/* Create a combined list of absolute + relative levels. */
683	TAILQ_FOREACH(set_arr, &rel_sets, link)
684		cpufreq_expand_set(sc, set_arr);
685
686	/* If the caller doesn't have enough space, return the actual count. */
687	if (sc->all_count > *count) {
688		*count = sc->all_count;
689		error = E2BIG;
690		goto out;
691	}
692
693	/* Finally, output the list of levels. */
694	i = 0;
695	TAILQ_FOREACH(lev, &sc->all_levels, link) {
696		/* Skip levels that have a frequency that is too low. */
697		if (lev->total_set.freq < cf_lowest_freq) {
698			sc->all_count--;
699			continue;
700		}
701
702		levels[i] = *lev;
703		i++;
704	}
705	*count = sc->all_count;
706	error = 0;
707
708out:
709	/* Clear all levels since we regenerate them each time. */
710	while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
711		TAILQ_REMOVE(&sc->all_levels, lev, link);
712		free(lev, M_TEMP);
713	}
714	sc->all_count = 0;
715
716	CF_MTX_UNLOCK(&sc->lock);
717	while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
718		TAILQ_REMOVE(&rel_sets, set_arr, link);
719		free(set_arr, M_TEMP);
720	}
721	return (error);
722}
723
724/*
725 * Create levels for an array of absolute settings and insert them in
726 * sorted order in the specified list.
727 */
728static int
729cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
730    int count)
731{
732	struct cf_level_lst *list;
733	struct cf_level *level, *search;
734	int i, inserted;
735
736	CF_MTX_ASSERT(&sc->lock);
737
738	list = &sc->all_levels;
739	for (i = 0; i < count; i++) {
740		level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
741		if (level == NULL)
742			return (ENOMEM);
743		level->abs_set = sets[i];
744		level->total_set = sets[i];
745		level->total_set.dev = NULL;
746		sc->all_count++;
747		inserted = 0;
748
749		if (TAILQ_EMPTY(list)) {
750			CF_DEBUG("adding abs setting %d at head\n",
751			    sets[i].freq);
752			TAILQ_INSERT_HEAD(list, level, link);
753			continue;
754		}
755
756		TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link)
757			if (sets[i].freq <= search->total_set.freq) {
758				CF_DEBUG("adding abs setting %d after %d\n",
759				    sets[i].freq, search->total_set.freq);
760				TAILQ_INSERT_AFTER(list, search, level, link);
761				inserted = 1;
762				break;
763			}
764
765		if (inserted == 0) {
766			TAILQ_FOREACH(search, list, link)
767				if (sets[i].freq >= search->total_set.freq) {
768					CF_DEBUG("adding abs setting %d before %d\n",
769					    sets[i].freq, search->total_set.freq);
770					TAILQ_INSERT_BEFORE(search, level, link);
771					break;
772				}
773		}
774	}
775
776	return (0);
777}
778
779/*
780 * Expand a group of relative settings, creating derived levels from them.
781 */
782static int
783cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
784{
785	struct cf_level *fill, *search;
786	struct cf_setting *set;
787	int i;
788
789	CF_MTX_ASSERT(&sc->lock);
790
791	/*
792	 * Walk the set of all existing levels in reverse.  This is so we
793	 * create derived states from the lowest absolute settings first
794	 * and discard duplicates created from higher absolute settings.
795	 * For instance, a level of 50 Mhz derived from 100 Mhz + 50% is
796	 * preferable to 200 Mhz + 25% because absolute settings are more
797	 * efficient since they often change the voltage as well.
798	 */
799	TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) {
800		/* Add each setting to the level, duplicating if necessary. */
801		for (i = 0; i < set_arr->count; i++) {
802			set = &set_arr->sets[i];
803
804			/*
805			 * If this setting is less than 100%, split the level
806			 * into two and add this setting to the new level.
807			 */
808			fill = search;
809			if (set->freq < 10000) {
810				fill = cpufreq_dup_set(sc, search, set);
811
812				/*
813				 * The new level was a duplicate of an existing
814				 * level or its absolute setting is too high
815				 * so we freed it.  For example, we discard a
816				 * derived level of 1000 MHz/25% if a level
817				 * of 500 MHz/100% already exists.
818				 */
819				if (fill == NULL)
820					break;
821			}
822
823			/* Add this setting to the existing or new level. */
824			KASSERT(fill->rel_count < MAX_SETTINGS,
825			    ("cpufreq: too many relative drivers (%d)",
826			    MAX_SETTINGS));
827			fill->rel_set[fill->rel_count] = *set;
828			fill->rel_count++;
829			CF_DEBUG(
830			"expand set added rel setting %d%% to %d level\n",
831			    set->freq / 100, fill->total_set.freq);
832		}
833	}
834
835	return (0);
836}
837
838static struct cf_level *
839cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
840    struct cf_setting *set)
841{
842	struct cf_level_lst *list;
843	struct cf_level *fill, *itr;
844	struct cf_setting *fill_set, *itr_set;
845	int i;
846
847	CF_MTX_ASSERT(&sc->lock);
848
849	/*
850	 * Create a new level, copy it from the old one, and update the
851	 * total frequency and power by the percentage specified in the
852	 * relative setting.
853	 */
854	fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
855	if (fill == NULL)
856		return (NULL);
857	*fill = *dup;
858	fill_set = &fill->total_set;
859	fill_set->freq =
860	    ((uint64_t)fill_set->freq * set->freq) / 10000;
861	if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
862		fill_set->power = ((uint64_t)fill_set->power * set->freq)
863		    / 10000;
864	}
865	if (set->lat != CPUFREQ_VAL_UNKNOWN) {
866		if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
867			fill_set->lat += set->lat;
868		else
869			fill_set->lat = set->lat;
870	}
871	CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);
872
873	/*
874	 * If we copied an old level that we already modified (say, at 100%),
875	 * we need to remove that setting before adding this one.  Since we
876	 * process each setting array in order, we know any settings for this
877	 * driver will be found at the end.
878	 */
879	for (i = fill->rel_count; i != 0; i--) {
880		if (fill->rel_set[i - 1].dev != set->dev)
881			break;
882		CF_DEBUG("removed last relative driver: %s\n",
883		    device_get_nameunit(set->dev));
884		fill->rel_count--;
885	}
886
887	/*
888	 * Insert the new level in sorted order.  If it is a duplicate of an
889	 * existing level (1) or has an absolute setting higher than the
890	 * existing level (2), do not add it.  We can do this since any such
891	 * level is guaranteed use less power.  For example (1), a level with
892	 * one absolute setting of 800 Mhz uses less power than one composed
893	 * of an absolute setting of 1600 Mhz and a relative setting at 50%.
894	 * Also for example (2), a level of 800 Mhz/75% is preferable to
895	 * 1600 Mhz/25% even though the latter has a lower total frequency.
896	 */
897	list = &sc->all_levels;
898	KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set"));
899	TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
900		itr_set = &itr->total_set;
901		if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
902			CF_DEBUG("dup set rejecting %d (dupe)\n",
903			    fill_set->freq);
904			itr = NULL;
905			break;
906		} else if (fill_set->freq < itr_set->freq) {
907			if (fill->abs_set.freq <= itr->abs_set.freq) {
908				CF_DEBUG(
909			"dup done, inserting new level %d after %d\n",
910				    fill_set->freq, itr_set->freq);
911				TAILQ_INSERT_AFTER(list, itr, fill, link);
912				sc->all_count++;
913			} else {
914				CF_DEBUG("dup set rejecting %d (abs too big)\n",
915				    fill_set->freq);
916				itr = NULL;
917			}
918			break;
919		}
920	}
921
922	/* We didn't find a good place for this new level so free it. */
923	if (itr == NULL) {
924		CF_DEBUG("dup set freeing new level %d (not optimal)\n",
925		    fill_set->freq);
926		free(fill, M_TEMP);
927		fill = NULL;
928	}
929
930	return (fill);
931}
932
933static int
934cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
935{
936	struct cpufreq_softc *sc;
937	struct cf_level *levels;
938	int best, count, diff, bdiff, devcount, error, freq, i, n;
939	device_t *devs;
940
941	devs = NULL;
942	sc = oidp->oid_arg1;
943	levels = sc->levels_buf;
944
945	error = CPUFREQ_GET(sc->dev, &levels[0]);
946	if (error)
947		goto out;
948	freq = levels[0].total_set.freq;
949	error = sysctl_handle_int(oidp, &freq, 0, req);
950	if (error != 0 || req->newptr == NULL)
951		goto out;
952
953	/*
954	 * While we only call cpufreq_get() on one device (assuming all
955	 * CPUs have equal levels), we call cpufreq_set() on all CPUs.
956	 * This is needed for some MP systems.
957	 */
958	error = devclass_get_devices(devclass_find("cpufreq"), &devs, &devcount);
959	if (error)
960		goto out;
961	for (n = 0; n < devcount; n++) {
962		count = CF_MAX_LEVELS;
963		error = CPUFREQ_LEVELS(devs[n], levels, &count);
964		if (error) {
965			if (error == E2BIG)
966				printf(
967			"cpufreq: need to increase CF_MAX_LEVELS\n");
968			break;
969		}
970		best = 0;
971		bdiff = 1 << 30;
972		for (i = 0; i < count; i++) {
973			diff = abs(levels[i].total_set.freq - freq);
974			if (diff < bdiff) {
975				bdiff = diff;
976				best = i;
977			}
978		}
979		error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER);
980	}
981
982out:
983	if (devs)
984		free(devs, M_TEMP);
985	return (error);
986}
987
988static int
989cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
990{
991	struct cpufreq_softc *sc;
992	struct cf_level *levels;
993	struct cf_setting *set;
994	struct sbuf sb;
995	int count, error, i;
996
997	sc = oidp->oid_arg1;
998	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
999
1000	/* Get settings from the device and generate the output string. */
1001	count = CF_MAX_LEVELS;
1002	levels = sc->levels_buf;
1003	if (levels == NULL) {
1004		sbuf_delete(&sb);
1005		return (ENOMEM);
1006	}
1007	error = CPUFREQ_LEVELS(sc->dev, levels, &count);
1008	if (error) {
1009		if (error == E2BIG)
1010			printf("cpufreq: need to increase CF_MAX_LEVELS\n");
1011		goto out;
1012	}
1013	if (count) {
1014		for (i = 0; i < count; i++) {
1015			set = &levels[i].total_set;
1016			sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
1017		}
1018	} else
1019		sbuf_cpy(&sb, "0");
1020	sbuf_trim(&sb);
1021	sbuf_finish(&sb);
1022	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
1023
1024out:
1025	sbuf_delete(&sb);
1026	return (error);
1027}
1028
1029static int
1030cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
1031{
1032	device_t dev;
1033	struct cf_setting *sets;
1034	struct sbuf sb;
1035	int error, i, set_count;
1036
1037	dev = oidp->oid_arg1;
1038	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
1039
1040	/* Get settings from the device and generate the output string. */
1041	set_count = MAX_SETTINGS;
1042	sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
1043	if (sets == NULL) {
1044		sbuf_delete(&sb);
1045		return (ENOMEM);
1046	}
1047	error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
1048	if (error)
1049		goto out;
1050	if (set_count) {
1051		for (i = 0; i < set_count; i++)
1052			sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
1053	} else
1054		sbuf_cpy(&sb, "0");
1055	sbuf_trim(&sb);
1056	sbuf_finish(&sb);
1057	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
1058
1059out:
1060	free(sets, M_TEMP);
1061	sbuf_delete(&sb);
1062	return (error);
1063}
1064
1065static void
1066cpufreq_add_freq_driver_sysctl(device_t cf_dev)
1067{
1068	struct cpufreq_softc *sc;
1069
1070	sc = device_get_softc(cf_dev);
1071	SYSCTL_ADD_CONST_STRING(&sc->sysctl_ctx,
1072	    SYSCTL_CHILDREN(device_get_sysctl_tree(cf_dev)), OID_AUTO,
1073	    "freq_driver", CTLFLAG_RD, device_get_nameunit(sc->cf_drv_dev),
1074	    "cpufreq driver used by this cpu");
1075}
1076
1077int
1078cpufreq_register(device_t dev)
1079{
1080	struct cpufreq_softc *sc;
1081	device_t cf_dev, cpu_dev;
1082	int error;
1083
1084	/* Add a sysctl to get each driver's settings separately. */
1085	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
1086	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1087	    OID_AUTO, "freq_settings",
1088	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, dev, 0,
1089	    cpufreq_settings_sysctl, "A", "CPU frequency driver settings");
1090
1091	/*
1092	 * Add only one cpufreq device to each CPU.  Currently, all CPUs
1093	 * must offer the same levels and be switched at the same time.
1094	 */
1095	cpu_dev = device_get_parent(dev);
1096	if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
1097		sc = device_get_softc(cf_dev);
1098		sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
1099		MPASS(sc->cf_drv_dev != NULL);
1100		return (0);
1101	}
1102
1103	/* Add the child device and possibly sysctls. */
1104	cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", device_get_unit(cpu_dev));
1105	if (cf_dev == NULL)
1106		return (ENOMEM);
1107	device_quiet(cf_dev);
1108
1109	error = device_probe_and_attach(cf_dev);
1110	if (error)
1111		return (error);
1112
1113	sc = device_get_softc(cf_dev);
1114	sc->cf_drv_dev = dev;
1115	cpufreq_add_freq_driver_sysctl(cf_dev);
1116	return (error);
1117}
1118
1119int
1120cpufreq_unregister(device_t dev)
1121{
1122	device_t cf_dev;
1123	struct cpufreq_softc *sc __diagused;
1124
1125	/*
1126	 * If this is the last cpufreq child device, remove the control
1127	 * device as well.  We identify cpufreq children by calling a method
1128	 * they support.
1129	 */
1130	cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
1131	if (cf_dev == NULL) {
1132		device_printf(dev,
1133	"warning: cpufreq_unregister called with no cpufreq device active\n");
1134		return (0);
1135	}
1136	sc = device_get_softc(cf_dev);
1137	MPASS(sc->cf_drv_dev == dev);
1138	device_delete_child(device_get_parent(cf_dev), cf_dev);
1139
1140	return (0);
1141}
1142
1143int
1144cpufreq_settings_changed(device_t dev)
1145{
1146
1147	EVENTHANDLER_INVOKE(cpufreq_levels_changed,
1148	    device_get_unit(device_get_parent(dev)));
1149	return (0);
1150}
1151