// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

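/* user stack traces, keyed by the stack_id returned from bpf_get_stackid() */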
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

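/* per-task storage: timestamp, state and stack saved when the task is scheduled out */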
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

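/* total off-cpu time in nanoseconds, aggregated by offcpu_key */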
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

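/* allowed CPUs; filled by user space when CPU filtering is in effect (see has_cpu) */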
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

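/* allowed PIDs or TGIDs (see uses_tgid); filled by user space (see has_task) */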
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

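/* allowed cgroup IDs; filled by user space (see has_cgroup) */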
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

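/* runtime switches written from user space; 'enabled' gates recording */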
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;
int uses_tgid = 0;

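/* load-time constants, fixed by user space before the BPF programs are loaded */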
const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

/*
 * Old kernels called it task_struct->state; newer kernels renamed it to
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both, as
 * described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

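/*
 * On cgroup v2 the id comes from the default hierarchy's kernfs node;
 * on v1 it is taken from the perf_event subsystem's cgroup instead.
 */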
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

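/* apply the CPU/task/cgroup filters; returns 1 if this sched-out should be recorded */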
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

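/*
 * Called on every context switch: stamp @prev with its user stack,
 * state and time as it goes off-cpu, and if @next was stamped before,
 * add the time it spent off-cpu to its entry in the off_cpu map.
 */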
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

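/*
 * When filtering by TGID, follow forks: a child created without
 * CLONE_THREAD starts a new thread group, so add its TGID to the
 * filter if the parent is already in it.  Threads share the parent's
 * TGID and need no new entry.
 */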
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

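/*
 * For tp_btf/sched_switch the raw tracepoint args arrive in ctx[]:
 * ctx[0] = preempt, ctx[1] = prev, ctx[2] = next, and, on kernels
 * where the tracepoint carries it, ctx[3] = prev_state
 * (see has_prev_state).
 */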
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	/* keep only the low task state bits; can_record() checks them */
	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";