/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018 Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sbuf.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/smp.h>
#include <sys/proc.h>
#include <sys/sched.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>

#include <contrib/dev/acpica/include/acpi.h>

#include <dev/acpica/acpivar.h>

#include <x86/cpufreq/hwpstate_intel_internal.h>

#include "acpi_if.h"
#include "cpufreq_if.h"

extern uint64_t	tsc_freq;

static int	intel_hwpstate_probe(device_t dev);
static int	intel_hwpstate_attach(device_t dev);
static int	intel_hwpstate_detach(device_t dev);
static int	intel_hwpstate_suspend(device_t dev);
static int	intel_hwpstate_resume(device_t dev);

static int	intel_hwpstate_get(device_t dev, struct cf_setting *cf);
static int	intel_hwpstate_type(device_t dev, int *type);

static device_method_t intel_hwpstate_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify,	intel_hwpstate_identify),
	DEVMETHOD(device_probe,		intel_hwpstate_probe),
	DEVMETHOD(device_attach,	intel_hwpstate_attach),
	DEVMETHOD(device_detach,	intel_hwpstate_detach),
	DEVMETHOD(device_suspend,	intel_hwpstate_suspend),
	DEVMETHOD(device_resume,	intel_hwpstate_resume),

	/* cpufreq interface */
	DEVMETHOD(cpufreq_drv_get,	intel_hwpstate_get),
	DEVMETHOD(cpufreq_drv_type,	intel_hwpstate_type),

	DEVMETHOD_END
};

struct hwp_softc {
	device_t		dev;
	bool			hwp_notifications;
	bool			hwp_activity_window;
	bool			hwp_pref_ctrl;
	bool			hwp_pkg_ctrl;
	bool			hwp_pkg_ctrl_en;
	bool			hwp_perf_bias;
	bool			hwp_perf_bias_cached;

	uint64_t		req; /* Cached copy of HWP_REQUEST */
	uint64_t		hwp_energy_perf_bias;	/* Cache PERF_BIAS */

	/* Raw performance levels (0-255) cached from HWP_CAPABILITIES. */
	uint8_t			high;
	uint8_t			guaranteed;
	uint8_t			efficient;
	uint8_t			low;
};

static driver_t hwpstate_intel_driver = {
	"hwpstate_intel",
	intel_hwpstate_methods,
	sizeof(struct hwp_softc),
};

DRIVER_MODULE(hwpstate_intel, cpu, hwpstate_intel_driver, NULL, NULL);
MODULE_VERSION(hwpstate_intel, 1);

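/*
 * Usage sketch: because the knob below is CTLFLAG_RDTUN, package-level
 * control can only be disabled from the loader, e.g. by setting
 * machdep.hwpstate_pkg_ctrl=0 in /boot/loader.conf; the value is read-only
 * once the system is up.
 */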
static bool hwpstate_pkg_ctrl_enable = true;
SYSCTL_BOOL(_machdep, OID_AUTO, hwpstate_pkg_ctrl, CTLFLAG_RDTUN,
    &hwpstate_pkg_ctrl_enable, 0,
    "Set 1 (default) to enable package-level control, 0 to disable");

static int
intel_hwp_dump_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	device_t dev;
	struct pcpu *pc;
	struct sbuf *sb;
	struct hwp_softc *sc;
	uint64_t data, data2;
	int ret;

	sc = (struct hwp_softc *)arg1;
	dev = sc->dev;

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	sb = sbuf_new(NULL, NULL, 1024, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
	sbuf_putc(sb, '\n');
	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	rdmsr_safe(MSR_IA32_PM_ENABLE, &data);
	sbuf_printf(sb, "CPU%d: HWP %sabled\n", pc->pc_cpuid,
	    ((data & 1) ? "En" : "Dis"));

	if (data == 0) {
		ret = 0;
		goto out;
	}

	rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &data);
	sbuf_printf(sb, "\tHighest Performance: %03ju\n", data & 0xff);
	sbuf_printf(sb, "\tGuaranteed Performance: %03ju\n", (data >> 8) & 0xff);
	sbuf_printf(sb, "\tEfficient Performance: %03ju\n", (data >> 16) & 0xff);
	sbuf_printf(sb, "\tLowest Performance: %03ju\n", (data >> 24) & 0xff);

	rdmsr_safe(MSR_IA32_HWP_REQUEST, &data);
	data2 = 0;
	if (sc->hwp_pkg_ctrl && (data & IA32_HWP_REQUEST_PACKAGE_CONTROL))
		rdmsr_safe(MSR_IA32_HWP_REQUEST_PKG, &data2);

	sbuf_putc(sb, '\n');

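/*
 * For each IA32_HWP_REQUEST field printed below, the per-thread value applies
 * when its valid bit is set (or when package control is not supported at
 * all); otherwise the package-wide value read from IA32_HWP_REQUEST_PKG
 * (data2) is in effect, so pkg_print() reports that one instead.
 */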
#define pkg_print(x, name, offset) do {					\
	if (!sc->hwp_pkg_ctrl || (data & x) != 0)			\
		sbuf_printf(sb, "\t%s: %03u\n", name,			\
		    (unsigned)(data >> offset) & 0xff);			\
	else								\
		sbuf_printf(sb, "\t%s: %03u\n", name,			\
		    (unsigned)(data2 >> offset) & 0xff);		\
} while (0)

	pkg_print(IA32_HWP_REQUEST_EPP_VALID,
	    "Requested Efficiency Performance Preference", 24);
	pkg_print(IA32_HWP_REQUEST_DESIRED_VALID,
	    "Requested Desired Performance", 16);
	pkg_print(IA32_HWP_REQUEST_MAXIMUM_VALID,
	    "Requested Maximum Performance", 8);
	pkg_print(IA32_HWP_REQUEST_MINIMUM_VALID,
	    "Requested Minimum Performance", 0);
#undef pkg_print

	sbuf_putc(sb, '\n');

out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	ret = sbuf_finish(sb);
	if (ret == 0)
		ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
	sbuf_delete(sb);

	return (ret);
}

static inline int
percent_to_raw(int x)
{

	MPASS(x <= 100 && x >= 0);
	return (0xff * x / 100);
}

/*
 * Given x * 10 in [0, 1000], return the integer nearest x.
 *
 * This allows round-tripping nice human-readable numbers through this
 * interface.  Otherwise, user-provided percentages such as 25, 50, 75 get
 * rounded down to 24, 49, and 74, which is a bit ugly.
 */
static inline int
round10(int xtimes10)
{
	return ((xtimes10 + 5) / 10);
}

static inline int
raw_to_percent(int x)
{
	MPASS(x <= 0xff && x >= 0);
	return (round10(x * 1000 / 0xff));
}
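
/*
 * Example round trip (arithmetic sketch): percent_to_raw(50) == 0xff * 50 /
 * 100 == 127, and raw_to_percent(127) == round10(127 * 1000 / 0xff) ==
 * round10(498) == 50.  Plain truncation (127 * 100 / 0xff) would instead
 * give 49.
 */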

/* Range of MSR_IA32_ENERGY_PERF_BIAS is more limited: 0-0xf. */
static inline int
percent_to_raw_perf_bias(int x)
{
	/*
	 * Round up so that raw values present as nice round human numbers and
	 * also round-trip to the same raw value.
	 */
	MPASS(x <= 100 && x >= 0);
	return (((0xf * x) + 50) / 100);
}

static inline int
raw_to_percent_perf_bias(int x)
{
	/* Rounding to nice human numbers despite a step interval of 6.67%. */
	MPASS(x <= 0xf && x >= 0);
	return (((x * 20) / 0xf) * 5);
}
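
/*
 * Example round trip (arithmetic sketch): percent_to_raw_perf_bias(50) ==
 * (0xf * 50 + 50) / 100 == 8, and raw_to_percent_perf_bias(8) ==
 * ((8 * 20) / 0xf) * 5 == 50, even though the hardware step is 100 / 15 ==
 * ~6.67%.
 */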

static int
sysctl_epp_select(SYSCTL_HANDLER_ARGS)
{
	struct hwp_softc *sc;
	device_t dev;
	struct pcpu *pc;
	uint64_t epb;
	uint32_t val;
	int ret;

	dev = oidp->oid_arg1;
	sc = device_get_softc(dev);
	if (!sc->hwp_pref_ctrl && !sc->hwp_perf_bias)
		return (ENODEV);

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	if (sc->hwp_pref_ctrl) {
		val = (sc->req & IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE) >> 24;
		val = raw_to_percent(val);
	} else {
		/*
		 * If cpuid indicates EPP is not supported, the HWP controller
		 * uses MSR_IA32_ENERGY_PERF_BIAS instead (Intel SDM §14.4.4).
		 * This register is per-core (but not HT).
		 */
		if (!sc->hwp_perf_bias_cached) {
			ret = rdmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, &epb);
			if (ret)
				goto out;
			sc->hwp_energy_perf_bias = epb;
			sc->hwp_perf_bias_cached = true;
		}
		val = sc->hwp_energy_perf_bias &
		    IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK;
		val = raw_to_percent_perf_bias(val);
	}

	MPASS(val >= 0 && val <= 100);

	ret = sysctl_handle_int(oidp, &val, 0, req);
	if (ret || req->newptr == NULL)
		goto out;

	if (val > 100) {
		ret = EINVAL;
		goto out;
	}

	if (sc->hwp_pref_ctrl) {
		val = percent_to_raw(val);

		sc->req =
		    ((sc->req & ~IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE)
		    | (val << 24u));

		if (sc->hwp_pkg_ctrl_en)
			ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req);
		else
			ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
	} else {
		val = percent_to_raw_perf_bias(val);
		MPASS((val & ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) == 0);

		sc->hwp_energy_perf_bias =
		    ((sc->hwp_energy_perf_bias &
		    ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) | val);
		ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS,
		    sc->hwp_energy_perf_bias);
	}

out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (ret);
}

void
intel_hwpstate_identify(driver_t *driver, device_t parent)
{
	if (device_find_child(parent, "hwpstate_intel", -1) != NULL)
		return;

	if (cpu_vendor_id != CPU_VENDOR_INTEL)
		return;

	if (resource_disabled("hwpstate_intel", 0))
		return;

	/*
	 * Intel SDM 14.4.1 (HWP Programming Interfaces):
	 *   Availability of HWP baseline resource and capability,
	 *   CPUID.06H:EAX[bit 7]: If this bit is set, HWP provides several new
	 *   architectural MSRs: IA32_PM_ENABLE, IA32_HWP_CAPABILITIES,
	 *   IA32_HWP_REQUEST, IA32_HWP_STATUS.
	 */
	if ((cpu_power_eax & CPUTPM1_HWP) == 0)
		return;

	if (BUS_ADD_CHILD(parent, 10, "hwpstate_intel", device_get_unit(parent))
	    == NULL)
		device_printf(parent, "hwpstate_intel: add child failed\n");
}

static int
intel_hwpstate_probe(device_t dev)
{

	device_set_desc(dev, "Intel Speed Shift");
	return (BUS_PROBE_NOWILDCARD);
}

static int
set_autonomous_hwp(struct hwp_softc *sc)
{
	struct pcpu *pc;
	device_t dev;
	uint64_t caps;
	int ret;

	dev = sc->dev;

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	/* XXX: Many MSRs aren't readable until the feature is enabled. */
	ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
	if (ret) {
		/*
		 * This is actually a package-level MSR, and only the first
		 * write is not ignored.  So it is harmless to enable it across
		 * all devices, and this allows us not to care especially in
		 * which order cores (and packages) are probed.  This error
		 * condition should not happen given we gate on the HWP CPUID
		 * feature flag, if the Intel SDM is correct.
		 */
		device_printf(dev, "Failed to enable HWP for cpu%d (%d)\n",
		    pc->pc_cpuid, ret);
		goto out;
	}

	ret = rdmsr_safe(MSR_IA32_HWP_REQUEST, &sc->req);
	if (ret) {
		device_printf(dev,
		    "Failed to read HWP request MSR for cpu%d (%d)\n",
		    pc->pc_cpuid, ret);
		goto out;
	}

	ret = rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &caps);
	if (ret) {
		device_printf(dev,
		    "Failed to read HWP capabilities MSR for cpu%d (%d)\n",
		    pc->pc_cpuid, ret);
		goto out;
	}

	/*
	 * High and low are static; guaranteed and efficient are dynamic.
	 */
	sc->high = IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(caps);
	sc->guaranteed = IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(caps);
	sc->efficient = IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(caps);
	sc->low = IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(caps);

	/* Hardware autonomous selection determines the performance target. */
	sc->req &= ~IA32_HWP_DESIRED_PERFORMANCE;

	/* Enable HW dynamic selection of window size. */
	sc->req &= ~IA32_HWP_ACTIVITY_WINDOW;

	/* IA32_HWP_REQUEST.Minimum_Performance = IA32_HWP_CAPABILITIES.Lowest_Performance */
	sc->req &= ~IA32_HWP_MINIMUM_PERFORMANCE;
	sc->req |= sc->low;

	/* IA32_HWP_REQUEST.Maximum_Performance = IA32_HWP_CAPABILITIES.Highest_Performance */
	sc->req &= ~IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE;
	sc->req |= sc->high << 8;
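
	/*
	 * At this point sc->req holds (summarizing the IA32_HWP_REQUEST
	 * layout from the Intel SDM as an aid): bits 0-7 Minimum = Lowest,
	 * bits 8-15 Maximum = Highest, bits 16-23 Desired = 0 (fully
	 * autonomous selection), bits 24-31 EPP left as previously
	 * programmed, bits 32-41 Activity Window = 0 (hardware-selected).
	 */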

	/* If supported, request package-level control for this CPU. */
	if (sc->hwp_pkg_ctrl_en)
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req |
		    IA32_HWP_REQUEST_PACKAGE_CONTROL);
	else
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
	if (ret) {
		device_printf(dev,
		    "Failed to set up%s autonomous HWP for cpu%d\n",
		    sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid);
		goto out;
	}

	/* If supported, write the package-wide control MSR as well. */
	if (sc->hwp_pkg_ctrl_en) {
		/*
		 * "The structure of the IA32_HWP_REQUEST_PKG MSR
		 * (package-level) is identical to the IA32_HWP_REQUEST MSR
		 * with the exception of the Package Control field, which does
		 * not exist." (Intel SDM §14.4.4)
		 */
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req);
		if (ret) {
			device_printf(dev,
			    "Failed to set autonomous HWP for package\n");
		}
	}

out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (ret);
}

static int
intel_hwpstate_attach(device_t dev)
{
	struct hwp_softc *sc;
	int ret;

	sc = device_get_softc(dev);
	sc->dev = dev;

	/* eax */
	if (cpu_power_eax & CPUTPM1_HWP_NOTIFICATION)
		sc->hwp_notifications = true;
	if (cpu_power_eax & CPUTPM1_HWP_ACTIVITY_WINDOW)
		sc->hwp_activity_window = true;
	if (cpu_power_eax & CPUTPM1_HWP_PERF_PREF)
		sc->hwp_pref_ctrl = true;
	if (cpu_power_eax & CPUTPM1_HWP_PKG)
		sc->hwp_pkg_ctrl = true;

	/* Allow administrators to disable pkg-level control. */
	sc->hwp_pkg_ctrl_en = (sc->hwp_pkg_ctrl && hwpstate_pkg_ctrl_enable);

	/* ecx */
	if (cpu_power_ecx & CPUID_PERF_BIAS)
		sc->hwp_perf_bias = true;

	ret = set_autonomous_hwp(sc);
	if (ret)
		return (ret);

	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO, device_get_nameunit(dev),
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
	    sc, 0, intel_hwp_dump_sysctl_handler, "A", "");

	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
	    "epp", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, dev, 0,
	    sysctl_epp_select, "I",
	    "Efficiency/Performance Preference "
	    "(range from 0, most performant, through 100, most efficient)");
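
	/*
	 * Usage sketch: the node above surfaces as
	 * dev.hwpstate_intel.<unit>.epp, e.g. "sysctl
	 * dev.hwpstate_intel.0.epp=100" biases CPU 0 toward efficiency and
	 * "...epp=0" toward maximum performance.
	 */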

	return (cpufreq_register(dev));
}

static int
intel_hwpstate_detach(device_t dev)
{

	return (cpufreq_unregister(dev));
}

static int
intel_hwpstate_get(device_t dev, struct cf_setting *set)
{
	struct pcpu *pc;
	uint64_t rate;
	int ret;

	if (set == NULL)
		return (EINVAL);

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	memset(set, CPUFREQ_VAL_UNKNOWN, sizeof(*set));
	set->dev = dev;

	ret = cpu_est_clockrate(pc->pc_cpuid, &rate);
	if (ret == 0)
		set->freq = rate / 1000000;

	set->volts = CPUFREQ_VAL_UNKNOWN;
	set->power = CPUFREQ_VAL_UNKNOWN;
	set->lat = CPUFREQ_VAL_UNKNOWN;

	return (0);
}

static int
intel_hwpstate_type(device_t dev, int *type)
{
	if (type == NULL)
		return (EINVAL);
	*type = CPUFREQ_TYPE_ABSOLUTE | CPUFREQ_FLAG_INFO_ONLY |
	    CPUFREQ_FLAG_UNCACHED;

	return (0);
}

static int
intel_hwpstate_suspend(device_t dev)
{
	return (0);
}

/*
 * Redo a subset of set_autonomous_hwp on resume; untested.  Without this,
 * testers observed that on resume MSR_IA32_HWP_REQUEST was bogus.
 */
static int
intel_hwpstate_resume(device_t dev)
{
	struct hwp_softc *sc;
	struct pcpu *pc;
	int ret;

	sc = device_get_softc(dev);

	pc = cpu_get_pcpu(dev);
	if (pc == NULL)
		return (ENXIO);

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);
	thread_unlock(curthread);

	ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1);
	if (ret) {
		device_printf(dev,
		    "Failed to enable HWP for cpu%d after suspend (%d)\n",
		    pc->pc_cpuid, ret);
		goto out;
	}

	if (sc->hwp_pkg_ctrl_en)
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req |
		    IA32_HWP_REQUEST_PACKAGE_CONTROL);
	else
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req);
	if (ret) {
		device_printf(dev,
		    "Failed to set%s autonomous HWP for cpu%d after suspend\n",
		    sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid);
		goto out;
	}
	if (sc->hwp_pkg_ctrl_en) {
		ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req);
		if (ret) {
			device_printf(dev,
			    "Failed to set autonomous HWP for package after "
			    "suspend\n");
			goto out;
		}
	}
	if (!sc->hwp_pref_ctrl && sc->hwp_perf_bias_cached) {
		ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS,
		    sc->hwp_energy_perf_bias);
		if (ret) {
			device_printf(dev,
			    "Failed to set energy perf bias for cpu%d after "
			    "suspend\n", pc->pc_cpuid);
		}
	}

out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (ret);
}