1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/version.h>
4#include <linux/ptrace.h>
5#include <uapi/linux/bpf.h>
6#include <bpf/bpf_helpers.h>
7
8/*
9 * The CPU number, cstate number and pstate number are based
10 * on 96boards Hikey with octa CA53 CPUs.
11 *
 * Every CPU has three idle states for cstate:
13 *   WFI, CPU_OFF, CLUSTER_OFF
14 *
 * Every CPU has 5 operating points:
16 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
17 *
 * This code is based on these assumptions; other platforms need to
 * adjust these definitions.
20 */
/* Platform limits; all three must be adjusted together when porting. */
enum {
	MAX_CPU			= 8,	/* CPUs tracked by the maps */
	MAX_PSTATE_ENTRIES	= 5,	/* operating points per CPU */
	MAX_CSTATE_ENTRIES	= 3,	/* idle states per CPU */
};
24
/* Known operating points in kHz, lowest first; the position is the pstate index. */
static int cpu_opps[] = {
	208000,		/* 208MHz */
	432000,		/* 432MHz */
	729000,		/* 729MHz */
	960000,		/* 960MHz */
	1200000,	/* 1200MHz */
};
26
27/*
 * my_map records the current cstate and pstate index together with a
 * timestamp (Idx, Ts). When a new event arrives, the pair is replaced by
 * the new state index and timestamp (Idx`, Ts`).
31 *
32 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
33 * interval for the previous state: Duration(Idx) = Ts` - Ts.
34 *
 * Every CPU has one array like the one below for recording state index
 * and timestamp; cstate and pstate are recorded separately:
37 *
38 * +--------------------------+
39 * | cstate timestamp         |
40 * +--------------------------+
41 * | cstate index             |
42 * +--------------------------+
43 * | pstate timestamp         |
44 * +--------------------------+
45 * | pstate index             |
46 * +--------------------------+
47 */
/* Slot offsets inside one CPU's group of my_map entries. */
enum {
	MAP_OFF_CSTATE_TIME	= 0,	/* cstate timestamp */
	MAP_OFF_CSTATE_IDX	= 1,	/* cstate index */
	MAP_OFF_PSTATE_TIME	= 2,	/* pstate timestamp */
	MAP_OFF_PSTATE_IDX	= 3,	/* pstate index */
	MAP_OFF_NUM		= 4,	/* slots per CPU */
};
53
54struct {
55	__uint(type, BPF_MAP_TYPE_ARRAY);
56	__type(key, u32);
57	__type(value, u64);
58	__uint(max_entries, MAX_CPU * MAP_OFF_NUM);
59} my_map SEC(".maps");
60
61/* cstate_duration records duration time for every idle state per CPU */
62struct {
63	__uint(type, BPF_MAP_TYPE_ARRAY);
64	__type(key, u32);
65	__type(value, u64);
66	__uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
67} cstate_duration SEC(".maps");
68
69/* pstate_duration records duration time for every operating point per CPU */
70struct {
71	__uint(type, BPF_MAP_TYPE_ARRAY);
72	__type(key, u32);
73	__type(value, u64);
74	__uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
75} pstate_duration SEC(".maps");
76
77/*
78 * The trace events for cpu_idle and cpu_frequency are taken from:
79 * /sys/kernel/tracing/events/power/cpu_idle/format
80 * /sys/kernel/tracing/events/power/cpu_frequency/format
81 *
 * These two events share the same format, so one common structure is defined.
83 */
84struct cpu_args {
85	u64 pad;
86	u32 state;
87	u32 cpu_id;
88};
89
90/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
91static u32 find_cpu_pstate_idx(u32 frequency)
92{
93	u32 i;
94
95	for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96		if (frequency == cpu_opps[i])
97			return i;
98	}
99
100	return i;
101}
102
103SEC("tracepoint/power/cpu_idle")
104int bpf_prog1(struct cpu_args *ctx)
105{
106	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107	u32 key, cpu, pstate_idx;
108	u64 *val;
109
110	if (ctx->cpu_id > MAX_CPU)
111		return 0;
112
113	cpu = ctx->cpu_id;
114
115	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116	cts = bpf_map_lookup_elem(&my_map, &key);
117	if (!cts)
118		return 0;
119
120	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121	cstate = bpf_map_lookup_elem(&my_map, &key);
122	if (!cstate)
123		return 0;
124
125	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126	pts = bpf_map_lookup_elem(&my_map, &key);
127	if (!pts)
128		return 0;
129
130	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131	pstate = bpf_map_lookup_elem(&my_map, &key);
132	if (!pstate)
133		return 0;
134
135	prev_state = *cstate;
136	*cstate = ctx->state;
137
138	if (!*cts) {
139		*cts = bpf_ktime_get_ns();
140		return 0;
141	}
142
143	cur_ts = bpf_ktime_get_ns();
144	delta = cur_ts - *cts;
145	*cts = cur_ts;
146
147	/*
148	 * When state doesn't equal to (u32)-1, the cpu will enter
149	 * one idle state; for this case we need to record interval
150	 * for the pstate.
151	 *
152	 *                 OPP2
153	 *            +---------------------+
154	 *     OPP1   |                     |
155	 *   ---------+                     |
156	 *                                  |  Idle state
157	 *                                  +---------------
158	 *
159	 *            |<- pstate duration ->|
160	 *            ^                     ^
161	 *           pts                  cur_ts
162	 */
163	if (ctx->state != (u32)-1) {
164
165		/* record pstate after have first cpu_frequency event */
166		if (!*pts)
167			return 0;
168
169		delta = cur_ts - *pts;
170
171		pstate_idx = find_cpu_pstate_idx(*pstate);
172		if (pstate_idx >= MAX_PSTATE_ENTRIES)
173			return 0;
174
175		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176		val = bpf_map_lookup_elem(&pstate_duration, &key);
177		if (val)
178			__sync_fetch_and_add((long *)val, delta);
179
180	/*
181	 * When state equal to (u32)-1, the cpu just exits from one
182	 * specific idle state; for this case we need to record
183	 * interval for the pstate.
184	 *
185	 *       OPP2
186	 *   -----------+
187	 *              |                          OPP1
188	 *              |                     +-----------
189	 *              |     Idle state      |
190	 *              +---------------------+
191	 *
192	 *              |<- cstate duration ->|
193	 *              ^                     ^
194	 *             cts                  cur_ts
195	 */
196	} else {
197
198		key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199		val = bpf_map_lookup_elem(&cstate_duration, &key);
200		if (val)
201			__sync_fetch_and_add((long *)val, delta);
202	}
203
204	/* Update timestamp for pstate as new start time */
205	if (*pts)
206		*pts = cur_ts;
207
208	return 0;
209}
210
211SEC("tracepoint/power/cpu_frequency")
212int bpf_prog2(struct cpu_args *ctx)
213{
214	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215	u32 key, cpu, pstate_idx;
216	u64 *val;
217
218	cpu = ctx->cpu_id;
219
220	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221	pts = bpf_map_lookup_elem(&my_map, &key);
222	if (!pts)
223		return 0;
224
225	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226	pstate = bpf_map_lookup_elem(&my_map, &key);
227	if (!pstate)
228		return 0;
229
230	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231	cstate = bpf_map_lookup_elem(&my_map, &key);
232	if (!cstate)
233		return 0;
234
235	prev_state = *pstate;
236	*pstate = ctx->state;
237
238	if (!*pts) {
239		*pts = bpf_ktime_get_ns();
240		return 0;
241	}
242
243	cur_ts = bpf_ktime_get_ns();
244	delta = cur_ts - *pts;
245	*pts = cur_ts;
246
247	/* When CPU is in idle, bail out to skip pstate statistics */
248	if (*cstate != (u32)(-1))
249		return 0;
250
251	/*
252	 * The cpu changes to another different OPP (in below diagram
253	 * change frequency from OPP3 to OPP1), need recording interval
254	 * for previous frequency OPP3 and update timestamp as start
255	 * time for new frequency OPP1.
256	 *
257	 *                 OPP3
258	 *            +---------------------+
259	 *     OPP2   |                     |
260	 *   ---------+                     |
261	 *                                  |    OPP1
262	 *                                  +---------------
263	 *
264	 *            |<- pstate duration ->|
265	 *            ^                     ^
266	 *           pts                  cur_ts
267	 */
268	pstate_idx = find_cpu_pstate_idx(*pstate);
269	if (pstate_idx >= MAX_PSTATE_ENTRIES)
270		return 0;
271
272	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273	val = bpf_map_lookup_elem(&pstate_duration, &key);
274	if (val)
275		__sync_fetch_and_add((long *)val, delta);
276
277	return 0;
278}
279
280char _license[] SEC("license") = "GPL";
281u32 _version SEC("version") = LINUX_VERSION_CODE;
282