// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <asm-generic/errno-base.h>

#include "lock_data.h"

/* __has_builtin() is used below; define a fallback for compilers that lack it */
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif

/* for collect_lock_syms().  4096 was rejected by the verifier */
#define MAX_CPUS  1024

/* lock contention flags from include/trace/events/lock.h */
#define LCB_F_SPIN	(1U << 0)
#define LCB_F_READ	(1U << 1)
#define LCB_F_WRITE	(1U << 2)
#define LCB_F_RT	(1U << 3)
#define LCB_F_PERCPU	(1U << 4)
#define LCB_F_MUTEX	(1U << 5)

/* callstack storage */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

/* maintain timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, struct tstamp_data);
	__uint(max_entries, MAX_ENTRIES);
} tstamp SEC(".maps");

/* maintain per-CPU timestamp at the beginning of contention */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct tstamp_data));
	__uint(max_entries, 1);
} tstamp_cpu SEC(".maps");

/* actual lock contention statistics */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct contention_key));
	__uint(value_size, sizeof(struct contention_data));
	__uint(max_entries, MAX_ENTRIES);
} lock_stat SEC(".maps");

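/* task comm data, keyed by the task pid */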
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct contention_task_data));
	__uint(max_entries, MAX_ENTRIES);
} task_data SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, MAX_ENTRIES);
} lock_syms SEC(".maps");

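/* filter maps: the presence of a key means that value passes the filter */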
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} type_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} addr_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

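/* compat definitions of kernel structs whose layout changed; resolved via CO-RE */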
struct rw_semaphore___old {
	struct task_struct *owner;
} __attribute__((preserve_access_index));

struct rw_semaphore___new {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mm_struct___old {
	struct rw_semaphore mmap_sem;
} __attribute__((preserve_access_index));

struct mm_struct___new {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

/* control flags */
int enabled;
int has_cpu;
int has_task;
int has_type;
int has_addr;
int has_cgroup;
int needs_callstack;
int stack_skip;
int lock_owner;

int use_cgroup_v2;
int perf_subsys_id = -1;

/* determine the key of lock stat */
int aggr_mode;

__u64 end_ts;

/* error stat */
int task_fail;
int stack_fail;
int time_fail;
int data_fail;

int task_map_full;
int data_map_full;

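/*
 * Return the cgroup id of the current task.  For cgroup v2, use the
 * BPF helper directly; for v1, follow the perf_event subsystem cgroup
 * and return its kernfs node id.
 */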
static inline __u64 get_current_cgroup_id(void)
{
	struct task_struct *task;
	struct cgroup *cgrp;

	if (use_cgroup_v2)
		return bpf_get_current_cgroup_id();

	task = bpf_get_current_task_btf();

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

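/*
 * Apply the optional cpu, task, lock type, lock address and cgroup
 * filters.  Returns 1 if this contention event should be recorded.
 */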
static inline int can_record(u64 *ctx)
{
	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid = bpf_get_current_pid_tgid();

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_type) {
		__u8 *ok;
		__u32 flags = (__u32)ctx[1];

		ok = bpf_map_lookup_elem(&type_filter, &flags);
		if (!ok)
			return 0;
	}

	if (has_addr) {
		__u8 *ok;
		__u64 addr = ctx[0];

		ok = bpf_map_lookup_elem(&addr_filter, &addr);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp = get_current_cgroup_id();

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
		if (!ok)
			return 0;
	}

	return 1;
}

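/* Save the task's comm in task_data (keyed by pid) unless the map is full. */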
static inline int update_task_data(struct task_struct *task)
{
	struct contention_task_data *p;
	int pid, err;

	err = bpf_core_read(&pid, sizeof(pid), &task->pid);
	if (err)
		return -1;

	p = bpf_map_lookup_elem(&task_data, &pid);
	if (p == NULL && !task_map_full) {
		struct contention_task_data data = {};

		BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
		if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
			task_map_full = 1;
	}

	return 0;
}

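/*
 * Extract the owner task from the lock.  Both mutex and (new) rwsem
 * keep the owner task_struct pointer with flag bits in the low bits
 * of the owner field; old rwsem kept a plain task_struct pointer.
 */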
static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
{
	struct task_struct *task;
	__u64 owner = 0;

	if (flags & LCB_F_MUTEX) {
		struct mutex *mutex = (void *)lock;
		owner = BPF_CORE_READ(mutex, owner.counter);
	} else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
	/*
	 * Support for the BPF_TYPE_MATCHES argument to the
	 * __builtin_preserve_type_info builtin was added at some point during
	 * development of clang 15 and it's what is needed for
	 * bpf_core_type_matches.
	 */
#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
		if (bpf_core_type_matches(struct rw_semaphore___old)) {
			struct rw_semaphore___old *rwsem = (void *)lock;
			owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
		} else if (bpf_core_type_matches(struct rw_semaphore___new)) {
			struct rw_semaphore___new *rwsem = (void *)lock;
			owner = BPF_CORE_READ(rwsem, owner.counter);
		}
#else
		/* assume new struct */
		struct rw_semaphore *rwsem = (void *)lock;
		owner = BPF_CORE_READ(rwsem, owner.counter);
#endif
	}

	if (!owner)
		return NULL;

	task = (void *)(owner & ~7UL);
	return task;
}

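/*
 * Check whether the contended lock is the current task's mmap lock
 * (rwsem) or siglock (spinlock) and return the matching LCD_F_* flag.
 */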
static inline __u32 check_lock_type(__u64 lock, __u32 flags)
{
	struct task_struct *curr;
	struct mm_struct___old *mm_old;
	struct mm_struct___new *mm_new;
	struct sighand_struct *sighand;

	switch (flags) {
	case LCB_F_READ:  /* rwsem */
	case LCB_F_WRITE:
		curr = bpf_get_current_task_btf();
		if (curr->mm == NULL)
			break;
		mm_new = (void *)curr->mm;
		if (bpf_core_field_exists(mm_new->mmap_lock)) {
			if (&mm_new->mmap_lock == (void *)lock)
				return LCD_F_MMAP_LOCK;
			break;
		}
		mm_old = (void *)curr->mm;
		if (bpf_core_field_exists(mm_old->mmap_sem)) {
			if (&mm_old->mmap_sem == (void *)lock)
				return LCD_F_MMAP_LOCK;
		}
		break;
	case LCB_F_SPIN:  /* spinlock */
		curr = bpf_get_current_task_btf();
		sighand = curr->sighand;

		if (sighand && &sighand->siglock == (void *)lock)
			return LCD_F_SIGHAND_LOCK;
		break;
	default:
		break;
	}
	return 0;
}

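/*
 * Find or create the timestamp entry for this contention: spinning
 * locks use the per-cpu array, sleeping locks use the hash map keyed
 * by pid.  Returns NULL for nested contention or on failure.
 */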
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
{
	__u32 pid;
	struct tstamp_data *pelem;

	/* Use per-cpu array map for spinlock and rwlock */
	if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
	    flags == (LCB_F_SPIN | LCB_F_WRITE)) {
		__u32 idx = 0;

		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
		/* Do not update the element for nested locks */
		if (pelem && pelem->lock)
			pelem = NULL;
		return pelem;
	}

	pid = bpf_get_current_pid_tgid();
	pelem = bpf_map_lookup_elem(&tstamp, &pid);
	/* Do not update the element for nested locks */
	if (pelem && pelem->lock)
		return NULL;

	if (pelem == NULL) {
		struct tstamp_data zero = {};

		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}

		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (pelem == NULL) {
			__sync_fetch_and_add(&task_fail, 1);
			return NULL;
		}
	}
	return pelem;
}

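/* Record the timestamp (and optionally the callstack or lock owner) at contention start. */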
SEC("tp_btf/contention_begin")
int contention_begin(u64 *ctx)
{
	struct tstamp_data *pelem;

	if (!enabled || !can_record(ctx))
		return 0;

	pelem = get_tstamp_elem(ctx[1]);
	if (pelem == NULL)
		return 0;

	pelem->timestamp = bpf_ktime_get_ns();
	pelem->lock = (__u64)ctx[0];
	pelem->flags = (__u32)ctx[1];

	if (needs_callstack) {
		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
						  BPF_F_FAST_STACK_CMP | stack_skip);
		if (pelem->stack_id < 0)
			__sync_fetch_and_add(&stack_fail, 1);
	} else if (aggr_mode == LOCK_AGGR_TASK) {
		struct task_struct *task;

		if (lock_owner) {
			task = get_lock_owner(pelem->lock, pelem->flags);

			/* The flags field is not used anymore.  Pass the owner pid instead. */
			if (task)
				pelem->flags = BPF_CORE_READ(task, pid);
			else
				pelem->flags = -1U;

		} else {
			task = bpf_get_current_task_btf();
		}

		if (task) {
			if (update_task_data(task) < 0 && lock_owner)
				pelem->flags = -1U;
		}
	}

	return 0;
}

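/*
 * At contention end, compute the duration and accumulate it into the
 * lock_stat entry keyed according to the aggregation mode.
 */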
SEC("tp_btf/contention_end")
int contention_end(u64 *ctx)
{
	__u32 pid = 0, idx = 0;
	struct tstamp_data *pelem;
	struct contention_key key = {};
	struct contention_data *data;
	__u64 duration;
	bool need_delete = false;

	if (!enabled)
		return 0;

	/*
	 * For spinlock and rwlock, the timestamp was saved in the per-cpu
	 * map.  However, contention_end does not receive the flags, so it
	 * cannot know whether to read the per-cpu or the hash map.
	 *
	 * Try the per-cpu map first and check if there's active contention.
	 * If there is, do not read the hash map because the task cannot
	 * block on sleeping locks before releasing the spinning locks.
	 */
	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
	if (pelem && pelem->lock) {
		if (pelem->lock != ctx[0])
			return 0;
	} else {
		pid = bpf_get_current_pid_tgid();
		pelem = bpf_map_lookup_elem(&tstamp, &pid);
		if (!pelem || pelem->lock != ctx[0])
			return 0;
		need_delete = true;
	}

	duration = bpf_ktime_get_ns() - pelem->timestamp;
	if ((__s64)duration < 0) {
		pelem->lock = 0;
		if (need_delete)
			bpf_map_delete_elem(&tstamp, &pid);
		__sync_fetch_and_add(&time_fail, 1);
		return 0;
	}

	switch (aggr_mode) {
	case LOCK_AGGR_CALLER:
		key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_TASK:
		if (lock_owner)
			key.pid = pelem->flags;
		else {
			if (!need_delete)
				pid = bpf_get_current_pid_tgid();
			key.pid = pid;
		}
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_ADDR:
		key.lock_addr_or_cgroup = pelem->lock;
		if (needs_callstack)
			key.stack_id = pelem->stack_id;
		break;
	case LOCK_AGGR_CGROUP:
		key.lock_addr_or_cgroup = get_current_cgroup_id();
		break;
	default:
		/* should not happen */
		return 0;
	}

	data = bpf_map_lookup_elem(&lock_stat, &key);
	if (!data) {
		if (data_map_full) {
			pelem->lock = 0;
			if (need_delete)
				bpf_map_delete_elem(&tstamp, &pid);
			__sync_fetch_and_add(&data_fail, 1);
			return 0;
		}

		struct contention_data first = {
			.total_time = duration,
			.max_time = duration,
			.min_time = duration,
			.count = 1,
			.flags = pelem->flags,
		};
		int err;

		if (aggr_mode == LOCK_AGGR_ADDR)
			first.flags |= check_lock_type(pelem->lock, pelem->flags);

		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
		if (err < 0) {
			if (err == -E2BIG)
				data_map_full = 1;
			__sync_fetch_and_add(&data_fail, 1);
		}
		pelem->lock = 0;
		if (need_delete)
			bpf_map_delete_elem(&tstamp, &pid);
		return 0;
	}

	__sync_fetch_and_add(&data->total_time, duration);
	__sync_fetch_and_add(&data->count, 1);

	/* FIXME: need atomic operations */
	if (data->max_time < duration)
		data->max_time = duration;
	if (data->min_time > duration)
		data->min_time = duration;

	pelem->lock = 0;
	if (need_delete)
		bpf_map_delete_elem(&tstamp, &pid);
	return 0;
}

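/*
 * Collect per-cpu runqueue lock addresses into the lock_syms map so
 * they can be reported as rq locks (LOCK_CLASS_RQLOCK).
 */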
extern struct rq runqueues __ksym;

struct rq___old {
	raw_spinlock_t lock;
} __attribute__((preserve_access_index));

struct rq___new {
	raw_spinlock_t __lock;
} __attribute__((preserve_access_index));

SEC("raw_tp/bpf_test_finish")
int BPF_PROG(collect_lock_syms)
{
	__u64 lock_addr, lock_off;
	__u32 lock_flag;

	if (bpf_core_field_exists(struct rq___new, __lock))
		lock_off = offsetof(struct rq___new, __lock);
	else
		lock_off = offsetof(struct rq___old, lock);

	for (int i = 0; i < MAX_CPUS; i++) {
		struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);

		if (rq == NULL)
			break;

		lock_addr = (__u64)(void *)rq + lock_off;
		lock_flag = LOCK_CLASS_RQLOCK;
		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
	}
	return 0;
}

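/* Record the timestamp at the end of the profiling session. */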
SEC("raw_tp/bpf_test_finish")
int BPF_PROG(end_timestamp)
{
	end_ts = bpf_ktime_get_ns();
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";