1// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2// Copyright (c) 2021 Facebook 3// Copyright (c) 2021 Google 4#include "vmlinux.h" 5#include <bpf/bpf_helpers.h> 6#include <bpf/bpf_tracing.h> 7#include <bpf/bpf_core_read.h> 8 9#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary 10#define MAX_EVENTS 32 // max events per cgroup: arbitrary 11 12// NOTE: many of map and global data will be modified before loading 13// from the userspace (perf tool) using the skeleton helpers. 14 15// single set of global perf events to measure 16struct { 17 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 18 __uint(key_size, sizeof(__u32)); 19 __uint(value_size, sizeof(int)); 20 __uint(max_entries, 1); 21} events SEC(".maps"); 22 23// from cgroup id to event index 24struct { 25 __uint(type, BPF_MAP_TYPE_HASH); 26 __uint(key_size, sizeof(__u64)); 27 __uint(value_size, sizeof(__u32)); 28 __uint(max_entries, 1); 29} cgrp_idx SEC(".maps"); 30 31// per-cpu event snapshots to calculate delta 32struct { 33 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 34 __uint(key_size, sizeof(__u32)); 35 __uint(value_size, sizeof(struct bpf_perf_event_value)); 36} prev_readings SEC(".maps"); 37 38// aggregated event values for each cgroup (per-cpu) 39// will be read from the user-space 40struct { 41 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 42 __uint(key_size, sizeof(__u32)); 43 __uint(value_size, sizeof(struct bpf_perf_event_value)); 44} cgrp_readings SEC(".maps"); 45 46/* new kernel cgroup definition */ 47struct cgroup___new { 48 int level; 49 struct cgroup *ancestors[]; 50} __attribute__((preserve_access_index)); 51 52/* old kernel cgroup definition */ 53struct cgroup___old { 54 int level; 55 u64 ancestor_ids[]; 56} __attribute__((preserve_access_index)); 57 58const volatile __u32 num_events = 1; 59const volatile __u32 num_cpus = 1; 60 61int enabled = 0; 62int use_cgroup_v2 = 0; 63int perf_subsys_id = -1; 64 65static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) 66{ 67 /* recast pointer to capture new type for compiler */ 68 struct cgroup___new *cgrp_new = (void *)cgrp; 69 70 if (bpf_core_field_exists(cgrp_new->ancestors)) { 71 return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); 72 } else { 73 /* recast pointer to capture old type for compiler */ 74 struct cgroup___old *cgrp_old = (void *)cgrp; 75 76 return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); 77 } 78} 79 80static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) 81{ 82 struct task_struct *p = (void *)bpf_get_current_task(); 83 struct cgroup *cgrp; 84 register int i = 0; 85 __u32 *elem; 86 int level; 87 int cnt; 88 89 if (perf_subsys_id == -1) { 90#if __has_builtin(__builtin_preserve_enum_value) 91 perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 92 perf_event_cgrp_id); 93#else 94 perf_subsys_id = perf_event_cgrp_id; 95#endif 96 } 97 cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup); 98 level = BPF_CORE_READ(cgrp, level); 99 100 for (cnt = 0; i < MAX_LEVELS; i++) { 101 __u64 cgrp_id; 102 103 if (i > level) 104 break; 105 106 // convert cgroup-id to a map index 107 cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); 108 elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); 109 if (!elem) 110 continue; 111 112 cgrps[cnt++] = *elem; 113 if (cnt == size) 114 break; 115 } 116 117 return cnt; 118} 119 120static inline int get_cgroup_v2_idx(__u32 *cgrps, int size) 121{ 122 register int i = 0; 123 __u32 *elem; 124 int cnt; 125 126 for (cnt = 0; i < MAX_LEVELS; i++) { 127 __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); 128 129 if (cgrp_id == 0) 130 break; 131 132 // convert cgroup-id to a map index 133 elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); 134 if (!elem) 135 continue; 136 137 cgrps[cnt++] = *elem; 138 if (cnt == size) 139 break; 140 } 141 142 return cnt; 143} 144 145static int bperf_cgroup_count(void) 146{ 147 register __u32 idx = 0; // to have it in a register to pass BPF verifier 148 register int c = 0; 149 struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; 150 __u32 cpu = bpf_get_smp_processor_id(); 151 __u32 cgrp_idx[MAX_LEVELS]; 152 int cgrp_cnt; 153 __u32 key, cgrp; 154 long err; 155 156 if (use_cgroup_v2) 157 cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS); 158 else 159 cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); 160 161 for ( ; idx < MAX_EVENTS; idx++) { 162 if (idx == num_events) 163 break; 164 165 // XXX: do not pass idx directly (for verifier) 166 key = idx; 167 // this is per-cpu array for diff 168 prev_val = bpf_map_lookup_elem(&prev_readings, &key); 169 if (!prev_val) { 170 val.counter = val.enabled = val.running = 0; 171 bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); 172 173 prev_val = bpf_map_lookup_elem(&prev_readings, &key); 174 if (!prev_val) 175 continue; 176 } 177 178 // read from global perf_event array 179 key = idx * num_cpus + cpu; 180 err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); 181 if (err) 182 continue; 183 184 if (enabled) { 185 delta.counter = val.counter - prev_val->counter; 186 delta.enabled = val.enabled - prev_val->enabled; 187 delta.running = val.running - prev_val->running; 188 189 for (c = 0; c < MAX_LEVELS; c++) { 190 if (c == cgrp_cnt) 191 break; 192 193 cgrp = cgrp_idx[c]; 194 195 // aggregate the result by cgroup 196 key = cgrp * num_events + idx; 197 cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); 198 if (cgrp_val) { 199 cgrp_val->counter += delta.counter; 200 cgrp_val->enabled += delta.enabled; 201 cgrp_val->running += delta.running; 202 } else { 203 bpf_map_update_elem(&cgrp_readings, &key, 204 &delta, BPF_ANY); 205 } 206 } 207 } 208 209 *prev_val = val; 210 } 211 return 0; 212} 213 214// This will be attached to cgroup-switches event for each cpu 215SEC("perf_event") 216int BPF_PROG(on_cgrp_switch) 217{ 218 return bperf_cgroup_count(); 219} 220 221SEC("raw_tp/sched_switch") 222int BPF_PROG(trigger_read) 223{ 224 return bperf_cgroup_count(); 225} 226 227char LICENSE[] SEC("license") = "Dual BSD/GPL"; 228