// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86_pkg_temp_thermal driver
 * Copyright (c) 2013, Intel Corporation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/intel_tcc.h>
#include <linux/err.h>
#include <linux/param.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>

#include <asm/cpu_device_id.h>

#include "thermal_interrupt.h"

/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * The delay should be long enough to avoid repeated events while a
 * threshold is set to a temperature that is constantly violated, but
 * short enough that user space can still take timely action. The
 * action can be removing the threshold or changing it to the next
 * interesting setting. Based on experiments, a period of around
 * 5 seconds under load gives a significant temperature change.
 */
#define PKG_TEMP_THERMAL_NOTIFY_DELAY	5000
static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
module_param(notify_delay_ms, int, 0644);
MODULE_PARM_DESC(notify_delay_ms,
	"User space notification delay in milliseconds.");
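/*
 * Example (assuming the standard module parameter sysfs layout; mode
 * 0644 above makes the parameter writable at runtime):
 *   echo 10000 > /sys/module/x86_pkg_temp_thermal/parameters/notify_delay_ms
 */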

/*
 * Number of trip points in the thermal zone. Currently it can't be
 * more than 2: the MSR allows setting and getting notifications for
 * only 2 thresholds. This define enforces that limit in case cpuid
 * returns a wrong value for the number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2
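
/**
 * struct zone_device - per package/die instance data
 * @cpu:		CPU currently used for MSR access and scheduled work
 * @work_scheduled:	true while the threshold work is pending
 * @msr_pkg_therm_low:	saved low word of MSR_IA32_PACKAGE_THERM_INTERRUPT
 * @msr_pkg_therm_high:	saved high word of MSR_IA32_PACKAGE_THERM_INTERRUPT
 * @work:		deferred work handling threshold interrupts
 * @tzone:		thermal zone registered for this package/die
 * @cpumask:		online CPUs belonging to this package/die
 */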
struct zone_device {
	int				cpu;
	bool				work_scheduled;
	u32				msr_pkg_therm_low;
	u32				msr_pkg_therm_high;
	struct delayed_work		work;
	struct thermal_zone_device	*tzone;
	struct cpumask			cpumask;
};

static struct thermal_zone_params pkg_temp_tz_params = {
	.no_hwmon	= true,
};

/* Keep track of how many zone pointers we allocated in init() */
static int max_id __read_mostly;
/* Array of zone pointers */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);

/* The dynamically assigned cpu hotplug state for module_exit() */
static enum cpuhp_state pkg_thermal_hp_state __read_mostly;

/* Debug counters to show using debugfs */
static struct dentry *debugfs;
static unsigned int pkg_interrupt_cnt;
static unsigned int pkg_work_cnt;
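/*
 * With debugfs mounted in its usual place, the counters show up as
 * /sys/kernel/debug/pkg_temp_thermal/pkg_thres_interrupt and
 * /sys/kernel/debug/pkg_temp_thermal/pkg_thres_work.
 */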

static void pkg_temp_debugfs_init(void)
{
	debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);

	debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
			   &pkg_interrupt_cnt);
	debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
			   &pkg_work_cnt);
}

/*
 * Protection:
 *
 * - cpu hotplug: Read serialized by cpu hotplug lock
 *		  Write must hold pkg_temp_lock
 *
 * - Other callsites: Must hold pkg_temp_lock
 */
static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
{
	int id = topology_logical_die_id(cpu);

	if (id >= 0 && id < max_id)
		return zones[id];
	return NULL;
}

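/*
 * Report the current package temperature to the thermal core in
 * millidegrees Celsius. intel_tcc_get_temp() returns whole degrees
 * Celsius (TJ_MAX minus the digital sensor readout), hence the
 * multiplication by 1000.
 */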
static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
{
	struct zone_device *zonedev = thermal_zone_device_priv(tzd);
	int val, ret;

	ret = intel_tcc_get_temp(zonedev->cpu, &val, true);
	if (ret < 0)
		return ret;

	*temp = val * 1000;
	pr_debug("sys_get_curr_temp %d\n", *temp);
	return 0;
}

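/*
 * Program a trip point temperature. The MSR stores each threshold as a
 * 7-bit offset below TJ_MAX in whole degrees Celsius, while the thermal
 * core passes millidegrees, hence the conversion below. Example usage
 * (assuming the standard thermal sysfs layout, with zone X being this
 * driver's x86_pkg_temp zone):
 *   echo 80000 > /sys/class/thermal/thermal_zoneX/trip_point_0_temp
 */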
static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
	struct zone_device *zonedev = thermal_zone_device_priv(tzd);
	u32 l, h, mask, shift, intr;
	int tj_max, val, ret;

	tj_max = intel_tcc_get_tjmax(zonedev->cpu);
	if (tj_max < 0)
		return tj_max;
	tj_max *= 1000;

	val = (tj_max - temp)/1000;

	if (trip >= MAX_NUMBER_OF_TRIPS || val < 0 || val > 0x7f)
		return -EINVAL;

	ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			   &l, &h);
	if (ret < 0)
		return ret;

	if (trip) {
		mask = THERM_MASK_THRESHOLD1;
		shift = THERM_SHIFT_THRESHOLD1;
		intr = THERM_INT_THRESHOLD1_ENABLE;
	} else {
		mask = THERM_MASK_THRESHOLD0;
		shift = THERM_SHIFT_THRESHOLD0;
		intr = THERM_INT_THRESHOLD0_ENABLE;
	}
	l &= ~mask;
	/*
	 * A trip temperature of 0 from user space indicates that it is no
	 * longer interested in receiving notifications for this threshold.
	 */
	if (!temp) {
		l &= ~intr;
	} else {
		l |= val << shift;
		l |= intr;
	}

	return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
			l, h);
}

/* Thermal zone callback registry */
static const struct thermal_zone_device_ops tzone_ops = {
	.get_temp = sys_get_curr_temp,
	.set_trip_temp = sys_set_trip_temp,
};

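/*
 * Returning true tells the thermal interrupt core that this driver does
 * its own rate control (the debounce delay above), so the core does not
 * have to throttle notifications on its behalf.
 */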
static bool pkg_thermal_rate_control(void)
{
	return true;
}

/* Enable threshold interrupt on local package/cpu */
static inline void enable_pkg_thres_interrupt(void)
{
	u8 thres_0, thres_1;
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
	/* Only enable the interrupt for thresholds that have a valid value */
	thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
	thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
	if (thres_0)
		l |= THERM_INT_THRESHOLD0_ENABLE;
	if (thres_1)
		l |= THERM_INT_THRESHOLD1_ENABLE;
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* Disable threshold interrupt on local package/cpu */
static inline void disable_pkg_thres_interrupt(void)
{
	u32 l, h;

	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);

	l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
	wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
{
	struct thermal_zone_device *tzone = NULL;
	int cpu = smp_processor_id();
	struct zone_device *zonedev;

	mutex_lock(&thermal_zone_mutex);
	raw_spin_lock_irq(&pkg_temp_lock);
	++pkg_work_cnt;

	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (!zonedev) {
		raw_spin_unlock_irq(&pkg_temp_lock);
		mutex_unlock(&thermal_zone_mutex);
		return;
	}
	zonedev->work_scheduled = false;

	thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
	tzone = zonedev->tzone;

	enable_pkg_thres_interrupt();
	raw_spin_unlock_irq(&pkg_temp_lock);

	/*
	 * If tzone is not NULL, then thermal_zone_mutex will prevent the
	 * concurrent removal in the cpu offline callback.
	 */
	if (tzone)
		thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);

	mutex_unlock(&thermal_zone_mutex);
}

static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
{
	unsigned long ms = msecs_to_jiffies(notify_delay_ms);

	schedule_delayed_work_on(cpu, work, ms);
}

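/*
 * Called from the package thermal interrupt handler, i.e. in hard
 * interrupt context. Hence only the raw spinlock is taken here and the
 * thermal zone update is deferred to the delayed work.
 */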
static int pkg_thermal_notify(u64 msr_val)
{
	int cpu = smp_processor_id();
	struct zone_device *zonedev;
	unsigned long flags;

	raw_spin_lock_irqsave(&pkg_temp_lock, flags);
	++pkg_interrupt_cnt;

	disable_pkg_thres_interrupt();

	/* Work is per package, so scheduling it once is enough. */
	zonedev = pkg_temp_thermal_get_dev(cpu);
	if (zonedev && !zonedev->work_scheduled) {
		zonedev->work_scheduled = true;
		pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
	}

	raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
	return 0;
}

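/*
 * Initialize the trip points from the thresholds currently programmed
 * in MSR_IA32_PACKAGE_THERM_INTERRUPT, e.g. left over from firmware or
 * a previous load of this driver, instead of starting from scratch.
 */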
static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
				       struct thermal_trip *trips, int num_trips)
{
	unsigned long thres_reg_value;
	u32 mask, shift, eax, edx;
	int ret, i;

	for (i = 0; i < num_trips; i++) {

		if (i) {
			mask = THERM_MASK_THRESHOLD1;
			shift = THERM_SHIFT_THRESHOLD1;
		} else {
			mask = THERM_MASK_THRESHOLD0;
			shift = THERM_SHIFT_THRESHOLD0;
		}

		ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
				   &eax, &edx);
		if (ret < 0)
			return ret;

		thres_reg_value = (eax & mask) >> shift;

		trips[i].temperature = thres_reg_value ?
			tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID;

		trips[i].type = THERMAL_TRIP_PASSIVE;
		trips[i].flags |= THERMAL_TRIP_FLAG_RW_TEMP;

		pr_debug("%s: cpu=%d, trip=%d, temp=%d\n",
			 __func__, cpu, i, trips[i].temperature);
	}

	return 0;
}

static int pkg_temp_thermal_device_add(unsigned int cpu)
{
	struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
	int id = topology_logical_die_id(cpu);
	u32 eax, ebx, ecx, edx;
	struct zone_device *zonedev;
	int thres_count, err;
	int tj_max;

	if (id >= max_id)
		return -ENOMEM;

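	/*
	 * CPUID leaf 6 reports the number of programmable interrupt
	 * thresholds of the digital thermal sensor in the low bits of EBX.
	 */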
	cpuid(6, &eax, &ebx, &ecx, &edx);
	thres_count = ebx & 0x07;
	if (!thres_count)
		return -ENODEV;

	thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);

	tj_max = intel_tcc_get_tjmax(cpu);
	if (tj_max < 0)
		return tj_max;

	zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
	if (!zonedev)
		return -ENOMEM;

	err = pkg_temp_thermal_trips_init(cpu, tj_max, trips, thres_count);
	if (err)
		goto out_kfree_zonedev;

	INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
	zonedev->cpu = cpu;
	zonedev->tzone = thermal_zone_device_register_with_trips("x86_pkg_temp",
			trips, thres_count,
			zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
	if (IS_ERR(zonedev->tzone)) {
		err = PTR_ERR(zonedev->tzone);
		goto out_kfree_zonedev;
	}
	err = thermal_zone_device_enable(zonedev->tzone);
	if (err)
		goto out_unregister_tz;

	/* Store MSR value for package thermal interrupt, to restore at exit */
	rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
	      zonedev->msr_pkg_therm_high);

	cpumask_set_cpu(cpu, &zonedev->cpumask);
	raw_spin_lock_irq(&pkg_temp_lock);
	zones[id] = zonedev;
	raw_spin_unlock_irq(&pkg_temp_lock);

	return 0;

out_unregister_tz:
	thermal_zone_device_unregister(zonedev->tzone);
out_kfree_zonedev:
	kfree(zonedev);
	return err;
}

static int pkg_thermal_cpu_offline(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	bool lastcpu, was_target;
	int target;

	if (!zonedev)
		return 0;

	target = cpumask_any_but(&zonedev->cpumask, cpu);
	cpumask_clear_cpu(cpu, &zonedev->cpumask);
	lastcpu = target >= nr_cpu_ids;
	/*
	 * If this is the last CPU in the package, remove the sysfs files
	 * before doing further cleanups.
	 */
	if (lastcpu) {
		struct thermal_zone_device *tzone = zonedev->tzone;

		/*
		 * We must protect against the work function calling
		 * thermal_zone_device_update() after/while unregistering.
		 * Null the pointer out under the zone mutex, so the work
		 * function won't try to call it.
		 */
		mutex_lock(&thermal_zone_mutex);
		zonedev->tzone = NULL;
		mutex_unlock(&thermal_zone_mutex);

		thermal_zone_device_unregister(tzone);
	}

	/* Protect against work and interrupts */
	raw_spin_lock_irq(&pkg_temp_lock);

	/*
	 * Check whether this cpu was the current target and store the new
	 * one. When we drop the lock, then the interrupt notify function
	 * will see the new target.
	 */
	was_target = zonedev->cpu == cpu;
	zonedev->cpu = target;

	/*
	 * If this is the last CPU in the package remove the package
	 * reference from the array and restore the interrupt MSR. When we
	 * drop the lock neither the interrupt notify function nor the
	 * worker will see the package anymore.
	 */
	if (lastcpu) {
		zones[topology_logical_die_id(cpu)] = NULL;
		/* After this point nothing touches the MSR anymore. */
		wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
		      zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
	}

	/*
	 * Check whether there is work scheduled and whether the work is
	 * targeted at the outgoing CPU.
	 */
	if (zonedev->work_scheduled && was_target) {
		/*
		 * To cancel the work we need to drop the lock, otherwise
		 * we might deadlock if the work needs to be flushed.
		 */
		raw_spin_unlock_irq(&pkg_temp_lock);
		cancel_delayed_work_sync(&zonedev->work);
		raw_spin_lock_irq(&pkg_temp_lock);
		/*
		 * If this is not the last cpu in the package and the work
		 * did not run after we dropped the lock above, then we
		 * need to reschedule the work, otherwise the interrupt
		 * stays disabled forever.
		 */
		if (!lastcpu && zonedev->work_scheduled)
			pkg_thermal_schedule_work(target, &zonedev->work);
	}

	raw_spin_unlock_irq(&pkg_temp_lock);

	/* Final cleanup if this is the last cpu */
	if (lastcpu)
		kfree(zonedev);

	return 0;
}

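/*
 * CPU hotplug online callback: attach the CPU to the existing zone for
 * its package/die, or create a new zone for the first CPU of one.
 */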
static int pkg_thermal_cpu_online(unsigned int cpu)
{
	struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	/* Paranoia check */
	if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
		return -ENODEV;

	/* If the package exists, nothing to do */
	if (zonedev) {
		cpumask_set_cpu(cpu, &zonedev->cpumask);
		return 0;
	}
	return pkg_temp_thermal_device_add(cpu);
}

static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);

static int __init pkg_temp_thermal_init(void)
{
	int ret;

	if (!x86_match_cpu(pkg_temp_thermal_ids))
		return -ENODEV;

	max_id = topology_max_packages() * topology_max_dies_per_package();
	zones = kcalloc(max_id, sizeof(struct zone_device *),
			GFP_KERNEL);
	if (!zones)
		return -ENOMEM;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
				pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
	if (ret < 0)
		goto err;

	/* Store the state for module exit */
	pkg_thermal_hp_state = ret;

	platform_thermal_package_notify = pkg_thermal_notify;
	platform_thermal_package_rate_control = pkg_thermal_rate_control;

	/* Don't care if it fails */
	pkg_temp_debugfs_init();
	return 0;

err:
	kfree(zones);
	return ret;
}
module_init(pkg_temp_thermal_init)

static void __exit pkg_temp_thermal_exit(void)
{
	platform_thermal_package_notify = NULL;
	platform_thermal_package_rate_control = NULL;

	cpuhp_remove_state(pkg_thermal_hp_state);
	debugfs_remove_recursive(debugfs);
	kfree(zones);
}
module_exit(pkg_temp_thermal_exit)

MODULE_IMPORT_NS(INTEL_TCC);
MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
MODULE_LICENSE("GPL v2");