1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4 *
5 * This exactly matches what is marshalled into the raw_syscall:sys_enter
6 * payload expected by the 'perf trace' beautifiers.
7 */
8
9#include "vmlinux.h"
10#include <bpf/bpf_helpers.h>
11#include <linux/limits.h>
12
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.  Return: true if @n is a power of 2, otherwise
 * false.
 *
 * This is a macro (not a function) so that it can be used inside
 * _Static_assert().  Every use of @n is parenthesized so that compound
 * arguments such as "a | b" or "c ? x : y" expand with the intended
 * precedence.  @n is evaluated more than once, so it must not have side
 * effects.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
22
/* Number of entries in the __augmented_syscalls__ perf event array below */
#define MAX_CPUS  4096

/*
 * bpf-output associated map
 *
 * Perf event array that augmented__output() writes the augmented payloads to,
 * using BPF_F_CURRENT_CPU to select the entry.
 */
struct __augmented_syscalls__ {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__type(key, int);
	__type(value, __u32);
	__uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
32
/*
 * What to augment at entry?
 *
 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
 *
 * Program array that sys_enter() tail calls into, using the syscall number
 * as the index.
 */
struct syscalls_sys_enter {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_enter SEC(".maps");
44
/*
 * What to augment at exit?
 *
 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
 *
 * Program array that sys_exit() tail calls into, using the syscall number
 * as the index.
 */
struct syscalls_sys_exit {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_exit SEC(".maps");
56
/*
 * Context type of the syscall entry handlers in this file: the common
 * tracepoint fields, followed by the syscall number and its six raw
 * arguments.  sys_enter() copies one of these verbatim into
 * augmented_args_payload.args.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	unsigned long	   args[6];
};
62
/*
 * Context type of the sys_exit() handler: the common tracepoint fields,
 * followed by the syscall number and the syscall return value.
 */
struct syscall_exit_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	long		   ret;
};
68
/*
 * One augmented pointer argument, as filled in by augmented_arg__read_str():
 * a small header followed by the bytes copied from userspace.
 */
struct augmented_arg {
	unsigned int	size;		/* bytes stored in value[], 0 on error */
	int		err;		/* 0, or the non-positive result of the failed read */
	char		value[PATH_MAX];	/* copied data; sizeof() must be a power of two */
};
74
/*
 * Set of pids whose syscalls must not be reported: sys_enter() and sys_exit()
 * return 0 right away when the id returned by getpid() is a key in this map.
 * Only the presence of the key is tested, the bool value is not read here.
 */
struct pids_filtered {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t);
	__type(value, bool);
	__uint(max_entries, 64);
} pids_filtered SEC(".maps");
81
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE   128     /* Implementation specific max size */

/* Type of the address family field that starts a socket address */
typedef unsigned short sa_family_t;

/*
 * FIXME: Should come from system headers
 *
 * The definition uses anonymous union and struct in order to control the
 * default alignment.
 *
 * The connect/sendto handlers bound the user supplied length with
 * "sizeof(struct sockaddr_storage) - 1" used as a mask, so the size of this
 * struct has to remain a power of two.
 */
struct sockaddr_storage {
	union {
		struct {
			sa_family_t    ss_family; /* address family */
			/* Following field(s) are implementation specific */
			char __data[SS_MAXSIZE - sizeof(unsigned short)];
				/* space to achieve desired size, */
				/* _SS_MAXSIZE value minus size of ss_family */
		};
		void *__align; /* implementation specific desired alignment */
	};
};
107
/*
 * What gets sent to the perf ring buffer by augmented__output(): the raw
 * syscall entry payload followed by the augmented data.  The union holds
 * whichever form the syscall specific handler needs:
 *
 *  - arg, arg2: up to two augmented_arg records (e.g. the two paths of rename)
 *  - saddr:     a socket address (connect, sendto)
 *  - __data:    raw bytes, sized as one augmented_arg (perf_event_attr, timespec)
 */
struct augmented_args_payload {
       struct syscall_enter_args args;
       union {
		struct {
			struct augmented_arg arg, arg2;
		};
		struct sockaddr_storage saddr;
		char   __data[sizeof(struct augmented_arg)];
	};
};
118
// We need more tmp space than the BPF stack can give us
//
// Single entry per-CPU array holding one struct augmented_args_payload, handed
// out by augmented_args_payload() and used as scratch space by the handlers.
struct augmented_args_tmp {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct augmented_args_payload);
	__uint(max_entries, 1);
} augmented_args_tmp SEC(".maps");
126
127static inline struct augmented_args_payload *augmented_args_payload(void)
128{
129	int key = 0;
130	return bpf_map_lookup_elem(&augmented_args_tmp, &key);
131}
132
133static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
134{
135	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
136	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
137}
138
/*
 * Copy a string from userspace into @augmented_arg and fill in its header.
 *
 * @augmented_arg: record to fill (size, err and value are all written)
 * @arg:           userspace address of the string
 * @arg_len:       size limit passed to bpf_probe_read_user_str()
 *
 * Return: how many bytes of @augmented_arg should be sent to userspace: the
 * header plus the string on success, just the header otherwise.
 */
static inline
unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
{
	unsigned int augmented_len = sizeof(*augmented_arg);
	int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);

	augmented_arg->size = augmented_arg->err = 0;
	/*
	 * bpf_probe_read_user_str() may return < 0, e.g. -EFAULT.
	 * A positive result is stored in augmented_arg->size; anything else
	 * is stored in augmented_arg->err, leaving ->size at zero.
	 */
	if (string_len > 0) {
		/* Drop the unused tail of ->value: header + string_len remain */
		augmented_len -= sizeof(augmented_arg->value) - string_len;
		_Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
		/*
		 * NOTE(review): presumably this mask exists to give the BPF
		 * verifier an upper bound for the length.  It also makes the
		 * length wrap around when header + string_len reaches
		 * sizeof(augmented_arg->value) - confirm that is acceptable.
		 */
		augmented_len &= sizeof(augmented_arg->value) - 1;
		augmented_arg->size = string_len;
	} else {
		/*
		 * So that userspace notices the error while still being able
		 * to skip this augmented arg record
		 */
		augmented_arg->err = string_len;
		augmented_len = offsetof(struct augmented_arg, value);
	}

	return augmented_len;
}
166
167SEC("tp/raw_syscalls/sys_enter")
168int syscall_unaugmented(struct syscall_enter_args *args)
169{
170	return 1;
171}
172
/*
 * These will be tail_called from SEC("tp/raw_syscalls/sys_enter"), so will
 * find in augmented_args_tmp what was read by that raw_syscalls:sys_enter
 * handler and go on from there, reading the pointer args each of them knows
 * about, e.g. connect's sockaddr or open's filename.
 */
179SEC("tp/syscalls/sys_enter_connect")
180int sys_enter_connect(struct syscall_enter_args *args)
181{
182	struct augmented_args_payload *augmented_args = augmented_args_payload();
183	const void *sockaddr_arg = (const void *)args->args[1];
184	unsigned int socklen = args->args[2];
185	unsigned int len = sizeof(augmented_args->args);
186
187        if (augmented_args == NULL)
188                return 1; /* Failure: don't filter */
189
190	_Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two");
191	socklen &= sizeof(augmented_args->saddr) - 1;
192
193	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
194
195	return augmented__output(args, augmented_args, len + socklen);
196}
197
198SEC("tp/syscalls/sys_enter_sendto")
199int sys_enter_sendto(struct syscall_enter_args *args)
200{
201	struct augmented_args_payload *augmented_args = augmented_args_payload();
202	const void *sockaddr_arg = (const void *)args->args[4];
203	unsigned int socklen = args->args[5];
204	unsigned int len = sizeof(augmented_args->args);
205
206        if (augmented_args == NULL)
207                return 1; /* Failure: don't filter */
208
209	socklen &= sizeof(augmented_args->saddr) - 1;
210
211	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
212
213	return augmented__output(args, augmented_args, len + socklen);
214}
215
216SEC("tp/syscalls/sys_enter_open")
217int sys_enter_open(struct syscall_enter_args *args)
218{
219	struct augmented_args_payload *augmented_args = augmented_args_payload();
220	const void *filename_arg = (const void *)args->args[0];
221	unsigned int len = sizeof(augmented_args->args);
222
223        if (augmented_args == NULL)
224                return 1; /* Failure: don't filter */
225
226	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
227
228	return augmented__output(args, augmented_args, len);
229}
230
231SEC("tp/syscalls/sys_enter_openat")
232int sys_enter_openat(struct syscall_enter_args *args)
233{
234	struct augmented_args_payload *augmented_args = augmented_args_payload();
235	const void *filename_arg = (const void *)args->args[1];
236	unsigned int len = sizeof(augmented_args->args);
237
238        if (augmented_args == NULL)
239                return 1; /* Failure: don't filter */
240
241	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
242
243	return augmented__output(args, augmented_args, len);
244}
245
246SEC("tp/syscalls/sys_enter_rename")
247int sys_enter_rename(struct syscall_enter_args *args)
248{
249	struct augmented_args_payload *augmented_args = augmented_args_payload();
250	const void *oldpath_arg = (const void *)args->args[0],
251		   *newpath_arg = (const void *)args->args[1];
252	unsigned int len = sizeof(augmented_args->args), oldpath_len;
253
254        if (augmented_args == NULL)
255                return 1; /* Failure: don't filter */
256
257	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
258	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
259
260	return augmented__output(args, augmented_args, len);
261}
262
263SEC("tp/syscalls/sys_enter_renameat")
264int sys_enter_renameat(struct syscall_enter_args *args)
265{
266	struct augmented_args_payload *augmented_args = augmented_args_payload();
267	const void *oldpath_arg = (const void *)args->args[1],
268		   *newpath_arg = (const void *)args->args[3];
269	unsigned int len = sizeof(augmented_args->args), oldpath_len;
270
271        if (augmented_args == NULL)
272                return 1; /* Failure: don't filter */
273
274	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
275	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
276
277	return augmented__output(args, augmented_args, len);
278}
279
/* Size assumed by sys_enter_perf_event_open() when the attr's size field is zero */
#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */

// we need just the start, get the size to then copy it
//
// sys_enter_perf_event_open() first reads only these two fields from the
// userspace attr, then uses 'size' to decide how much to read in full.
struct perf_event_attr_size {
        __u32                   type;
        /*
         * Size of the attr structure, for fwd/bwd compat.
         */
        __u32                   size;
};
290
291SEC("tp/syscalls/sys_enter_perf_event_open")
292int sys_enter_perf_event_open(struct syscall_enter_args *args)
293{
294	struct augmented_args_payload *augmented_args = augmented_args_payload();
295	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
296	unsigned int len = sizeof(augmented_args->args);
297
298        if (augmented_args == NULL)
299		goto failure;
300
301	if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0)
302		goto failure;
303
304	attr_read = (const struct perf_event_attr_size *)augmented_args->__data;
305
306	__u32 size = attr_read->size;
307
308	if (!size)
309		size = PERF_ATTR_SIZE_VER0;
310
311	if (size > sizeof(augmented_args->__data))
312                goto failure;
313
314	// Now that we read attr->size and tested it against the size limits, read it completely
315	if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0)
316		goto failure;
317
318	return augmented__output(args, augmented_args, len + size);
319failure:
320	return 1; /* Failure: don't filter */
321}
322
323SEC("tp/syscalls/sys_enter_clock_nanosleep")
324int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
325{
326	struct augmented_args_payload *augmented_args = augmented_args_payload();
327	const void *rqtp_arg = (const void *)args->args[2];
328	unsigned int len = sizeof(augmented_args->args);
329	__u32 size = sizeof(struct timespec64);
330
331        if (augmented_args == NULL)
332		goto failure;
333
334	if (size > sizeof(augmented_args->__data))
335                goto failure;
336
337	bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg);
338
339	return augmented__output(args, augmented_args, len + size);
340failure:
341	return 1; /* Failure: don't filter */
342}
343
344static pid_t getpid(void)
345{
346	return bpf_get_current_pid_tgid();
347}
348
349static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
350{
351	return bpf_map_lookup_elem(pids, &pid) != NULL;
352}
353
354SEC("tp/raw_syscalls/sys_enter")
355int sys_enter(struct syscall_enter_args *args)
356{
357	struct augmented_args_payload *augmented_args;
358	/*
359	 * We start len, the amount of data that will be in the perf ring
360	 * buffer, if this is not filtered out by one of pid_filter__has(),
361	 * syscall->enabled, etc, with the non-augmented raw syscall payload,
362	 * i.e. sizeof(augmented_args->args).
363	 *
364	 * We'll add to this as we add augmented syscalls right after that
365	 * initial, non-augmented raw_syscalls:sys_enter payload.
366	 */
367
368	if (pid_filter__has(&pids_filtered, getpid()))
369		return 0;
370
371	augmented_args = augmented_args_payload();
372	if (augmented_args == NULL)
373		return 1;
374
375	bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
376
377	/*
378	 * Jump to syscall specific augmenter, even if the default one,
379	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
380	 * unaugmented tracepoint payload.
381	 */
382	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
383
384	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
385	return 0;
386}
387
388SEC("tp/raw_syscalls/sys_exit")
389int sys_exit(struct syscall_exit_args *args)
390{
391	struct syscall_exit_args exit_args;
392
393	if (pid_filter__has(&pids_filtered, getpid()))
394		return 0;
395
396	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
397	/*
398	 * Jump to syscall specific return augmenter, even if the default one,
399	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
400	 * unaugmented tracepoint payload.
401	 */
402	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
403	/*
404	 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
405	 */
406	return 0;
407}
408
/* License string for this BPF object, placed in the "license" ELF section */
char _license[] SEC("license") = "GPL";
410