// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel IOMMU PerfMon
 * Copyright(c) 2023 Intel Corporation.
 */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/dmar.h>
#include "iommu.h"
#include "perfmon.h"

PMU_FORMAT_ATTR(event,		"config:0-27");		/* ES: Events Select */
PMU_FORMAT_ATTR(event_group,	"config:28-31");	/* EGI: Event Group Index */

static struct attribute *iommu_pmu_format_attrs[] = {
	&format_attr_event_group.attr,
	&format_attr_event.attr,
	NULL
};

static struct attribute_group iommu_pmu_format_attr_group = {
	.name = "format",
	.attrs = iommu_pmu_format_attrs,
};

/* The available events are added in attr_update later */
static struct attribute *attrs_empty[] = {
	NULL
};

static struct attribute_group iommu_pmu_events_attr_group = {
	.name = "events",
	.attrs = attrs_empty,
};

static cpumask_t iommu_pmu_cpu_mask;

static ssize_t
cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
}
static DEVICE_ATTR_RO(cpumask);

static struct attribute *iommu_pmu_cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL
};

static struct attribute_group iommu_pmu_cpumask_attr_group = {
	.attrs = iommu_pmu_cpumask_attrs,
};

static const struct attribute_group *iommu_pmu_attr_groups[] = {
	&iommu_pmu_format_attr_group,
	&iommu_pmu_events_attr_group,
	&iommu_pmu_cpumask_attr_group,
	NULL
};

static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
{
	/*
	 * The perf_event creates its own dev for each PMU.
	 * See pmu_dev_alloc()
	 */
	return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
}

#define IOMMU_PMU_ATTR(_name, _format, _filter)				\
	PMU_FORMAT_ATTR(_name, _format);				\
									\
static struct attribute *_name##_attr[] = {				\
	&format_attr_##_name.attr,					\
	NULL								\
};									\
									\
static umode_t								\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)	\
{									\
	struct device *dev = kobj_to_dev(kobj);				\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);		\
									\
	if (!iommu_pmu)							\
		return 0;						\
	return (iommu_pmu->filter & _filter) ? attr->mode : 0;		\
}									\
									\
static struct attribute_group _name = {					\
	.name		= "format",					\
	.attrs		= _name##_attr,					\
	.is_visible	= _name##_is_visible,				\
};

IOMMU_PMU_ATTR(filter_requester_id_en,	"config1:0",		IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain_en,	"config1:1",		IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid_en,		"config1:2",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats_en,		"config1:3",		IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table_en,	"config1:4",		IOMMU_PMU_FILTER_PAGE_TABLE);
IOMMU_PMU_ATTR(filter_requester_id,	"config1:16-31",	IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain,		"config1:32-47",	IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid,		"config2:0-21",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats,		"config2:24-28",	IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table,	"config2:32-36",	IOMMU_PMU_FILTER_PAGE_TABLE);

#define iommu_pmu_en_requester_id(e)		((e) & 0x1)
#define iommu_pmu_en_domain(e)			(((e) >> 1) & 0x1)
#define iommu_pmu_en_pasid(e)			(((e) >> 2) & 0x1)
#define iommu_pmu_en_ats(e)			(((e) >> 3) & 0x1)
#define iommu_pmu_en_page_table(e)		(((e) >> 4) & 0x1)
#define iommu_pmu_get_requester_id(filter)	(((filter) >> 16) & 0xffff)
#define iommu_pmu_get_domain(filter)		(((filter) >> 32) & 0xffff)
#define iommu_pmu_get_pasid(filter)		((filter) & 0x3fffff)
#define iommu_pmu_get_ats(filter)		(((filter) >> 24) & 0x1f)
#define iommu_pmu_get_page_table(filter)	(((filter) >> 32) & 0x1f)

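/*
 * Program/clear a counter's filter registers. Each counter's filter
 * block starts IOMMU_PMU_CFG_SIZE past its configuration register, with
 * one register per filter type laid out in filter-bit order (hence
 * ffs(_filter) - 1) at IOMMU_PMU_CFG_FILTERS_OFFSET intervals. A filter
 * value is written together with IOMMU_PMU_FILTER_EN, and only when both
 * the HW capability and the user's enable bit (in configX) are set.
 */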
#define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig)		\
{										\
	if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) {	\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
	}									\
}

#define iommu_pmu_clear_filter(_filter, _idx)					\
{										\
	if (iommu_pmu->filter & _filter) {					\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    0);							\
	}									\
}

/*
 * Define the event attr related functions
 * Input: _name: event attr name
 *        _string: string of the event in sysfs
 *        _g_idx: event group encoding
 *        _event: event encoding
 */
#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event)			\
	PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string)		\
										\
static struct attribute *_name##_attr[] = {					\
	&event_attr_##_name.attr.attr,						\
	NULL									\
};										\
										\
static umode_t									\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)		\
{										\
	struct device *dev = kobj_to_dev(kobj);					\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);			\
										\
	if (!iommu_pmu)								\
		return 0;							\
	return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0;		\
}										\
										\
static struct attribute_group _name = {						\
	.name		= "events",						\
	.attrs		= _name##_attr,						\
	.is_visible	= _name##_is_visible,					\
};

IOMMU_PMU_EVENT_ATTR(iommu_clocks,		"event_group=0x0,event=0x001", 0x0, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_requests,		"event_group=0x0,event=0x002", 0x0, 0x002)
IOMMU_PMU_EVENT_ATTR(pw_occupancy,		"event_group=0x0,event=0x004", 0x0, 0x004)
IOMMU_PMU_EVENT_ATTR(ats_blocked,		"event_group=0x0,event=0x008", 0x0, 0x008)
IOMMU_PMU_EVENT_ATTR(iommu_mrds,		"event_group=0x1,event=0x001", 0x1, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked,		"event_group=0x1,event=0x020", 0x1, 0x020)
IOMMU_PMU_EVENT_ATTR(pg_req_posted,		"event_group=0x1,event=0x040", 0x1, 0x040)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup,		"event_group=0x2,event=0x001", 0x2, 0x001)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit,		"event_group=0x2,event=0x002", 0x2, 0x002)
IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup,	"event_group=0x2,event=0x004", 0x2, 0x004)
IOMMU_PMU_EVENT_ATTR(pasid_cache_hit,		"event_group=0x2,event=0x008", 0x2, 0x008)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup,		"event_group=0x2,event=0x010", 0x2, 0x010)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit,		"event_group=0x2,event=0x020", 0x2, 0x020)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup,		"event_group=0x2,event=0x040", 0x2, 0x040)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit,		"event_group=0x2,event=0x080", 0x2, 0x080)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup,	"event_group=0x2,event=0x100", 0x2, 0x100)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit,		"event_group=0x2,event=0x200", 0x2, 0x200)
IOMMU_PMU_EVENT_ATTR(iotlb_lookup,		"event_group=0x3,event=0x001", 0x3, 0x001)
IOMMU_PMU_EVENT_ATTR(iotlb_hit,			"event_group=0x3,event=0x002", 0x3, 0x002)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup,		"event_group=0x3,event=0x004", 0x3, 0x004)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit,		"event_group=0x3,event=0x008", 0x3, 0x008)
IOMMU_PMU_EVENT_ATTR(int_cache_lookup,		"event_group=0x4,event=0x001", 0x4, 0x001)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted,	"event_group=0x4,event=0x002", 0x4, 0x002)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted,	"event_group=0x4,event=0x004", 0x4, 0x004)

static const struct attribute_group *iommu_pmu_attr_update[] = {
	&filter_requester_id_en,
	&filter_domain_en,
	&filter_pasid_en,
	&filter_ats_en,
	&filter_page_table_en,
	&filter_requester_id,
	&filter_domain,
	&filter_pasid,
	&filter_ats,
	&filter_page_table,
	&iommu_clocks,
	&iommu_requests,
	&pw_occupancy,
	&ats_blocked,
	&iommu_mrds,
	&iommu_mem_blocked,
	&pg_req_posted,
	&ctxt_cache_lookup,
	&ctxt_cache_hit,
	&pasid_cache_lookup,
	&pasid_cache_hit,
	&ss_nonleaf_lookup,
	&ss_nonleaf_hit,
	&fs_nonleaf_lookup,
	&fs_nonleaf_hit,
	&hpt_nonleaf_lookup,
	&hpt_nonleaf_hit,
	&iotlb_lookup,
	&iotlb_hit,
	&hpt_leaf_lookup,
	&hpt_leaf_hit,
	&int_cache_lookup,
	&int_cache_hit_nonposted,
	&int_cache_hit_posted,
	NULL
};
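
/*
 * Illustrative usage (assuming an IOMMU PMU named "dmar0"; only the
 * events and filters backed by the enumerated HW capabilities are
 * visible in sysfs):
 *
 *   perf stat -a -e dmar0/iommu_requests/ sleep 1
 *   perf stat -a -e dmar0/iommu_mem_blocked,filter_requester_id_en=0x1,filter_requester_id=0x100/ sleep 1
 */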

static inline void __iomem *
iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
}

static inline void __iomem *
iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
}

static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
{
	return container_of(event->pmu, struct iommu_pmu, pmu);
}

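/*
 * Compose the per-counter configuration value: the event select and the
 * event group index from perf's attr.config, plus the interrupt enable
 * bit so a counter overflow raises a perfmon interrupt.
 */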
static inline u64 iommu_event_config(struct perf_event *event)
{
	u64 config = event->attr.config;

	return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
	       (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
	       IOMMU_EVENT_CFG_INT;
}

static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
				      struct perf_event *event)
{
	return event->pmu == &iommu_pmu->pmu;
}

static int iommu_pmu_validate_event(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	u32 event_group = iommu_event_group(event->attr.config);

	if (event_group >= iommu_pmu->num_eg)
		return -EINVAL;

	return 0;
}

static int iommu_pmu_validate_group(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct perf_event *sibling;
	int nr = 0;

	/*
	 * All events in a group must be scheduled simultaneously.
	 * Check whether there are enough counters for all the events.
	 */
	for_each_sibling_event(sibling, event->group_leader) {
		if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
		    sibling->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (++nr > iommu_pmu->num_cntr)
			return -EINVAL;
	}

	return 0;
}

static int iommu_pmu_event_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* sampling not supported */
	if (event->attr.sample_period)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	if (iommu_pmu_validate_event(event))
		return -EINVAL;

	hwc->config = iommu_event_config(event);

	return iommu_pmu_validate_group(event);
}

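/*
 * Fold the current HW counter value into event->count. The counter is
 * free running; the xchg()-and-retry on prev_count copes with a racing
 * update (e.g. from the overflow handler).
 */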
static void iommu_pmu_event_update(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_count, new_count, delta;
	int shift = 64 - iommu_pmu->cntr_width;

again:
	prev_count = local64_read(&hwc->prev_count);
	new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
		goto again;

	/*
	 * The counter width is enumerated. Always shift the counter
	 * before using it.
	 */
	delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}

static void iommu_pmu_start(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;
	u64 count;

	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
		return;

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	hwc->state = 0;

	/* Always reprogram the period */
	count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	local64_set((&hwc->prev_count), count);

	/*
	 * Errors from ecmd are ignored here.
	 * - The existing perf_event subsystem doesn't handle errors at
	 *   this point. Only the IOMMU PMU can return a runtime HW error.
	 *   We don't want to change the existing generic interfaces for
	 *   this specific case.
	 * - It's a corner case caused by HW, which is very unlikely to
	 *   happen. There is nothing SW can do about it.
	 * - In the worst case the user sees <not counted> in the perf
	 *   output, which at least gives a hint that something went wrong.
	 */
	ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, 0);

	perf_event_update_userpage(event);
}

static void iommu_pmu_stop(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, 0);

		iommu_pmu_event_update(event);

		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	}
}

static inline int
iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
				  int idx, struct perf_event *event)
{
	u32 event_group = iommu_event_group(event->attr.config);
	u32 select = iommu_event_select(event->attr.config);

	if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
		return -EINVAL;

	return 0;
}

static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
				  struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	/*
	 * The counters which support limited events are usually at the end.
	 * Schedule them first to accommodate more events.
	 */
	for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
		if (test_and_set_bit(idx, iommu_pmu->used_mask))
			continue;
		/* Check per-counter event capabilities */
		if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
			break;
		clear_bit(idx, iommu_pmu->used_mask);
	}
	if (idx < 0)
		return -EINVAL;

	iommu_pmu->event_list[idx] = event;
	hwc->idx = idx;

	/* config events */
	dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);

	iommu_pmu_set_filter(requester_id, event->attr.config1,
			     IOMMU_PMU_FILTER_REQUESTER_ID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(domain, event->attr.config1,
			     IOMMU_PMU_FILTER_DOMAIN, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(pasid, event->attr.config2,
			     IOMMU_PMU_FILTER_PASID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(ats, event->attr.config2,
			     IOMMU_PMU_FILTER_ATS, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(page_table, event->attr.config2,
			     IOMMU_PMU_FILTER_PAGE_TABLE, idx,
			     event->attr.config1);

	return 0;
}

static int iommu_pmu_add(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct hw_perf_event *hwc = &event->hw;
	int ret;

	ret = iommu_pmu_assign_event(iommu_pmu, event);
	if (ret < 0)
		return ret;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (flags & PERF_EF_START)
		iommu_pmu_start(event, 0);

	return 0;
}

static void iommu_pmu_del(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	int idx = event->hw.idx;

	iommu_pmu_stop(event, PERF_EF_UPDATE);

	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);

	iommu_pmu->event_list[idx] = NULL;
	event->hw.idx = -1;
	clear_bit(idx, iommu_pmu->used_mask);

	perf_event_update_userpage(event);
}

static void iommu_pmu_enable(struct pmu *pmu)
{
	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
	struct intel_iommu *iommu = iommu_pmu->iommu;

	ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0);
}

static void iommu_pmu_disable(struct pmu *pmu)
{
	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
	struct intel_iommu *iommu = iommu_pmu->iommu;

	ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
}

static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
{
	struct perf_event *event;
	u64 status;
	int i;

	/*
	 * Two counters may overflow very close to each other. Always
	 * check whether there are more overflows to handle.
	 */
	while ((status = dmar_readq(iommu_pmu->overflow))) {
		for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
			/*
			 * Find the assigned event of the counter.
			 * Accumulate the value into the event->count.
			 */
			event = iommu_pmu->event_list[i];
			if (!event) {
				pr_warn_once("Cannot find the assigned event for counter %d\n", i);
				continue;
			}
			iommu_pmu_event_update(event);
		}

		dmar_writeq(iommu_pmu->overflow, status);
	}
}

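/*
 * Perfmon overflow interrupt, handled in a threaded handler (the IRQ is
 * requested with a NULL primary handler and IRQF_ONESHOT). Bail out if
 * the Perfmon Interrupt Status bit isn't set; otherwise fold the
 * overflowed counters into their events and clear the status by writing
 * it back.
 */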
static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;

	if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
		return IRQ_NONE;

	iommu_pmu_counter_overflow(iommu->pmu);

	/* Clear the status bit */
	dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);

	return IRQ_HANDLED;
}

static int __iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	iommu_pmu->pmu.name		= iommu->name;
	iommu_pmu->pmu.task_ctx_nr	= perf_invalid_context;
	iommu_pmu->pmu.event_init	= iommu_pmu_event_init;
	iommu_pmu->pmu.pmu_enable	= iommu_pmu_enable;
	iommu_pmu->pmu.pmu_disable	= iommu_pmu_disable;
	iommu_pmu->pmu.add		= iommu_pmu_add;
	iommu_pmu->pmu.del		= iommu_pmu_del;
	iommu_pmu->pmu.start		= iommu_pmu_start;
	iommu_pmu->pmu.stop		= iommu_pmu_stop;
	iommu_pmu->pmu.read		= iommu_pmu_event_update;
	iommu_pmu->pmu.attr_groups	= iommu_pmu_attr_groups;
	iommu_pmu->pmu.attr_update	= iommu_pmu_attr_update;
	iommu_pmu->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	iommu_pmu->pmu.module		= THIS_MODULE;

	return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
}

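/*
 * The perfmon register blocks are discovered indirectly: the register
 * at @offset holds the byte offset of the corresponding block from the
 * IOMMU register base.
 */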
static inline void __iomem *
get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
{
	u32 off = dmar_readl(iommu->reg + offset);

	return iommu->reg + off;
}

int alloc_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu;
	int i, j, ret;
	u64 perfcap;
	u32 cap;

	if (!ecap_pms(iommu->ecap))
		return 0;

	/* The IOMMU PMU requires ECMD support as well */
	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
	/* Performance monitoring is not supported. */
	if (!perfcap)
		return -ENODEV;

	/* Sanity check the number of counters and event groups */
	if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
		return -ENODEV;

	/* The interrupt on overflow is required */
	if (!pcap_interrupt(perfcap))
		return -ENODEV;

	/* Check required Enhanced Command Capability */
	if (!ecmd_has_pmu_essential(iommu))
		return -ENODEV;

	iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
	if (!iommu_pmu)
		return -ENOMEM;

	iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
	if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
		pr_warn_once("The number of IOMMU counters %d > max(%d), clipping!\n",
			     iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);
		iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
	}

	iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
	iommu_pmu->filter = pcap_filters_mask(perfcap);
	iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
	iommu_pmu->num_eg = pcap_num_event_group(perfcap);

	iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
	if (!iommu_pmu->evcap) {
		ret = -ENOMEM;
		goto free_pmu;
	}

	/* Parse event group capabilities */
	for (i = 0; i < iommu_pmu->num_eg; i++) {
		u64 pcap;

		pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
				  i * IOMMU_PMU_CAP_REGS_STEP);
		iommu_pmu->evcap[i] = pecap_es(pcap);
	}

	iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
	if (!iommu_pmu->cntr_evcap) {
		ret = -ENOMEM;
		goto free_pmu_evcap;
	}
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
		if (!iommu_pmu->cntr_evcap[i]) {
			ret = -ENOMEM;
			goto free_pmu_cntr_evcap;
		}
		/*
		 * Start with the global capabilities and adjust them
		 * according to the per-counter capabilities later.
		 */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
	}

	iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
	iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
	iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);

	/*
	 * Check per-counter capabilities. All counters should have the
	 * same capabilities on Interrupt on Overflow Support and Counter
	 * Width.
	 */
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		cap = dmar_readl(iommu_pmu->cfg_reg +
				 i * IOMMU_PMU_CFG_OFFSET +
				 IOMMU_PMU_CFG_CNTRCAP_OFFSET);
		if (!iommu_cntrcap_pcc(cap))
			continue;

		/*
		 * It's possible that some counters have a different
		 * capability because of e.g., HW bug. Check the corner
		 * case here and simply drop those counters.
		 */
		if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
		    !iommu_cntrcap_ios(cap)) {
			iommu_pmu->num_cntr = i;
			pr_warn("PMU counter capability inconsistent, counter number reduced to %d\n",
				iommu_pmu->num_cntr);
		}

		/* Clear the pre-defined event group capabilities */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = 0;

		/* Override with per-counter event capabilities */
		for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
			cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
					 IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
					 (j * IOMMU_PMU_OFF_REGS_STEP));
			iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
			/*
			 * Some events may only be supported by a specific counter.
			 * Track them in the evcap as well.
			 */
			iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
		}
	}

	iommu_pmu->iommu = iommu;
	iommu->pmu = iommu_pmu;

	return 0;

free_pmu_cntr_evcap:
	for (i = 0; i < iommu_pmu->num_cntr; i++)
		kfree(iommu_pmu->cntr_evcap[i]);
	kfree(iommu_pmu->cntr_evcap);
free_pmu_evcap:
	kfree(iommu_pmu->evcap);
free_pmu:
	kfree(iommu_pmu);

	return ret;
}

void free_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (iommu_pmu->evcap) {
		int i;

		for (i = 0; i < iommu_pmu->num_cntr; i++)
			kfree(iommu_pmu->cntr_evcap[i]);
		kfree(iommu_pmu->cntr_evcap);
	}
	kfree(iommu_pmu->evcap);
	kfree(iommu_pmu);
	iommu->pmu = NULL;
}

static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;
	int irq, ret;

	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0)
		return -EINVAL;

	snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);

	iommu->perf_irq = irq;
	ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
				   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
	if (ret) {
		dmar_free_hwirq(irq);
		iommu->perf_irq = 0;
		return ret;
	}
	return 0;
}

static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
{
	if (!iommu->perf_irq)
		return;

	free_irq(iommu->perf_irq, iommu);
	dmar_free_hwirq(iommu->perf_irq);
	iommu->perf_irq = 0;
}

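/*
 * Counting for all IOMMU PMUs is done on a single designated CPU. The
 * first CPU to come online claims iommu_pmu_cpu_mask, and every
 * iommu_pmu instance follows whichever CPU the mask currently holds.
 */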
static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);

	if (cpumask_empty(&iommu_pmu_cpu_mask))
		cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);

	if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
		iommu_pmu->cpu = cpu;

	return 0;
}

static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
	int target = cpumask_first(&iommu_pmu_cpu_mask);

	/*
	 * The iommu_pmu_cpu_mask has already been updated when offlining
	 * the CPU for the first iommu_pmu. Migrate the other iommu_pmus
	 * to the new target.
	 */
	if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
		perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
		iommu_pmu->cpu = target;
		return 0;
	}

	if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
		return 0;

	target = cpumask_any_but(cpu_online_mask, cpu);

	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
	else
		return 0;

	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
	iommu_pmu->cpu = target;

	return 0;
}

static int nr_iommu_pmu;
static enum cpuhp_state iommu_cpuhp_slot;

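/*
 * A single dynamic CPU hotplug state is shared by all IOMMU PMUs: it is
 * set up when the first PMU is registered and removed again when the
 * last one goes away. nr_iommu_pmu tracks the registered instances.
 */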
static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
{
	int ret;

	if (!nr_iommu_pmu) {
		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
					      "driver/iommu/intel/perfmon:online",
					      iommu_pmu_cpu_online,
					      iommu_pmu_cpu_offline);
		if (ret < 0)
			return ret;
		iommu_cpuhp_slot = ret;
	}

	ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
	if (ret) {
		if (!nr_iommu_pmu)
			cpuhp_remove_multi_state(iommu_cpuhp_slot);
		return ret;
	}
	nr_iommu_pmu++;

	return 0;
}

static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
{
	cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);

	if (--nr_iommu_pmu)
		return;

	cpuhp_remove_multi_state(iommu_cpuhp_slot);
}

void iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (__iommu_pmu_register(iommu))
		goto err;

	if (iommu_pmu_cpuhp_setup(iommu_pmu))
		goto unregister;

	/* Set interrupt for overflow */
	if (iommu_pmu_set_interrupt(iommu))
		goto cpuhp_free;

	return;

cpuhp_free:
	iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
	perf_pmu_unregister(&iommu_pmu->pmu);
err:
	pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id);
	free_iommu_pmu(iommu);
}

void iommu_pmu_unregister(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	iommu_pmu_unset_interrupt(iommu);
	iommu_pmu_cpuhp_free(iommu_pmu);
	perf_pmu_unregister(&iommu_pmu->pmu);
}