1// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2// Copyright (c) 2021 Facebook
3// Copyright (c) 2021 Google
4#include "vmlinux.h"
5#include <bpf/bpf_helpers.h>
6#include <bpf/bpf_tracing.h>
7#include <bpf/bpf_core_read.h>
8
9#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
10#define MAX_EVENTS  32  // max events per cgroup: arbitrary
11
// NOTE: many of the maps and the global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.
14
// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));  // perf event fd, installed by userspace
	__uint(max_entries, 1);  // resized by userspace before load (see NOTE)
} events SEC(".maps");
22
// from cgroup id to event index
// key: __u64 cgroup id, value: __u32 index used to address the reading maps
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);  // resized by userspace before load (see NOTE)
} cgrp_idx SEC(".maps");
30
31// per-cpu event snapshots to calculate delta
32struct {
33	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
34	__uint(key_size, sizeof(__u32));
35	__uint(value_size, sizeof(struct bpf_perf_event_value));
36} prev_readings SEC(".maps");
37
38// aggregated event values for each cgroup (per-cpu)
39// will be read from the user-space
40struct {
41	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
42	__uint(key_size, sizeof(__u32));
43	__uint(value_size, sizeof(struct bpf_perf_event_value));
44} cgrp_readings SEC(".maps");
45
/* new kernel cgroup definition: ancestors are kept as struct pointers.
 * Only the fields this program reads are declared; preserve_access_index
 * makes accesses CO-RE-relocatable against the running kernel's layout. */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));
51
/* old kernel cgroup definition: ancestors are kept as plain u64 ids.
 * Which layout actually exists is detected at load time with
 * bpf_core_field_exists() in get_cgroup_v1_ancestor_id(). */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));
57
// set by userspace before load (const volatile => rodata, verifier-known)
const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;

int enabled = 0;	// toggled by userspace to start/stop aggregation
int use_cgroup_v2 = 0;	// cgroup v2 vs v1 hierarchy walk selection
int perf_subsys_id = -1;	// cached perf_event cgroup subsys id; resolved lazily
64
/*
 * Return the cgroup id of the @level-th ancestor of @cgrp (cgroup v1).
 *
 * The kernel's struct cgroup layout changed: newer kernels keep an array of
 * ancestor cgroup pointers ('ancestors'), older ones an array of plain ids
 * ('ancestor_ids').  A CO-RE field-existence check selects the right access
 * at load time against the running kernel.
 */
static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
	/* recast pointer to capture new type for compiler */
	struct cgroup___new *cgrp_new = (void *)cgrp;

	if (bpf_core_field_exists(cgrp_new->ancestors)) {
		/* new layout: the id lives in the ancestor's kernfs node */
		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
	} else {
		/* recast pointer to capture old type for compiler */
		struct cgroup___old *cgrp_old = (void *)cgrp;

		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
	}
}
79
/*
 * Fill @cgrps (capacity @size) with map indexes for the current task's v1
 * perf_event cgroup and its ancestors, by looking each ancestor's cgroup id
 * up in the cgrp_idx map.  Ancestors not found in the map are skipped.
 * Returns the number of entries written.
 */
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	/* resolve and cache the perf_event cgroup subsystem id once */
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	/* bounded by the constant MAX_LEVELS so the verifier can prove it */
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
119
/*
 * Fill @cgrps (capacity @size) with map indexes for the current task's v2
 * cgroup ancestry, walking upward via bpf_get_current_ancestor_cgroup_id()
 * until it returns 0 (no ancestor at that level).  Ancestors not found in
 * the cgrp_idx map are skipped.  Returns the number of entries written.
 */
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	/* bounded by the constant MAX_LEVELS so the verifier can prove it */
	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
144
/*
 * Read each perf event's current value on this cpu, compute the delta since
 * the previous per-cpu snapshot, and — when counting is enabled — add that
 * delta to the aggregate of every cgroup the current task belongs to.
 * The snapshot is always refreshed so the next delta starts from here.
 * Always returns 0 (BPF program convention).
 */
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	// collect map indexes for the task's cgroup and its ancestors
	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			// first reading on this cpu: seed a zero snapshot
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			// attribute the delta to each collected cgroup index
			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		// remember current values for the next delta
		*prev_val = val;
	}
	return 0;
}
213
214// This will be attached to cgroup-switches event for each cpu
215SEC("perf_event")
216int BPF_PROG(on_cgrp_switch)
217{
218	return bperf_cgroup_count();
219}
220
221SEC("raw_tp/sched_switch")
222int BPF_PROG(trigger_read)
223{
224	return bperf_cgroup_count();
225}
226
227char LICENSE[] SEC("license") = "Dual BSD/GPL";
228