1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9#include "vmlinux.h" 10#include <bpf/bpf_helpers.h> 11#include <linux/limits.h> 12 13/** 14 * is_power_of_2() - check if a value is a power of two 15 * @n: the value to check 16 * 17 * Determine whether some value is a power of two, where zero is *not* 18 * considered a power of two. Return: true if @n is a power of 2, otherwise 19 * false. 20 */ 21#define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 22 23#define MAX_CPUS 4096 24 25/* bpf-output associated map */ 26struct __augmented_syscalls__ { 27 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 28 __type(key, int); 29 __type(value, __u32); 30 __uint(max_entries, MAX_CPUS); 31} __augmented_syscalls__ SEC(".maps"); 32 33/* 34 * What to augment at entry? 35 * 36 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 37 */ 38struct syscalls_sys_enter { 39 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 40 __type(key, __u32); 41 __type(value, __u32); 42 __uint(max_entries, 512); 43} syscalls_sys_enter SEC(".maps"); 44 45/* 46 * What to augment at exit? 47 * 48 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 49 */ 50struct syscalls_sys_exit { 51 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 52 __type(key, __u32); 53 __type(value, __u32); 54 __uint(max_entries, 512); 55} syscalls_sys_exit SEC(".maps"); 56 57struct syscall_enter_args { 58 unsigned long long common_tp_fields; 59 long syscall_nr; 60 unsigned long args[6]; 61}; 62 63struct syscall_exit_args { 64 unsigned long long common_tp_fields; 65 long syscall_nr; 66 long ret; 67}; 68 69struct augmented_arg { 70 unsigned int size; 71 int err; 72 char value[PATH_MAX]; 73}; 74 75struct pids_filtered { 76 __uint(type, BPF_MAP_TYPE_HASH); 77 __type(key, pid_t); 78 __type(value, bool); 79 __uint(max_entries, 64); 80} pids_filtered SEC(".maps"); 81 82/* 83 * Desired design of maximum size and alignment (see RFC2553) 84 */ 85#define SS_MAXSIZE 128 /* Implementation specific max size */ 86 87typedef unsigned short sa_family_t; 88 89/* 90 * FIXME: Should come from system headers 91 * 92 * The definition uses anonymous union and struct in order to control the 93 * default alignment. 94 */ 95struct sockaddr_storage { 96 union { 97 struct { 98 sa_family_t ss_family; /* address family */ 99 /* Following field(s) are implementation specific */ 100 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 101 /* space to achieve desired size, */ 102 /* _SS_MAXSIZE value minus size of ss_family */ 103 }; 104 void *__align; /* implementation specific desired alignment */ 105 }; 106}; 107 108struct augmented_args_payload { 109 struct syscall_enter_args args; 110 union { 111 struct { 112 struct augmented_arg arg, arg2; 113 }; 114 struct sockaddr_storage saddr; 115 char __data[sizeof(struct augmented_arg)]; 116 }; 117}; 118 119// We need more tmp space than the BPF stack can give us 120struct augmented_args_tmp { 121 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 122 __type(key, int); 123 __type(value, struct augmented_args_payload); 124 __uint(max_entries, 1); 125} augmented_args_tmp SEC(".maps"); 126 127static inline struct augmented_args_payload *augmented_args_payload(void) 128{ 129 int key = 0; 130 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 131} 132 133static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 134{ 135 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 136 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 137} 138 139static inline 140unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 141{ 142 unsigned int augmented_len = sizeof(*augmented_arg); 143 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 144 145 augmented_arg->size = augmented_arg->err = 0; 146 /* 147 * probe_read_str may return < 0, e.g. -EFAULT 148 * So we leave that in the augmented_arg->size that userspace will 149 */ 150 if (string_len > 0) { 151 augmented_len -= sizeof(augmented_arg->value) - string_len; 152 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 153 augmented_len &= sizeof(augmented_arg->value) - 1; 154 augmented_arg->size = string_len; 155 } else { 156 /* 157 * So that username notice the error while still being able 158 * to skip this augmented arg record 159 */ 160 augmented_arg->err = string_len; 161 augmented_len = offsetof(struct augmented_arg, value); 162 } 163 164 return augmented_len; 165} 166 167SEC("tp/raw_syscalls/sys_enter") 168int syscall_unaugmented(struct syscall_enter_args *args) 169{ 170 return 1; 171} 172 173/* 174 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 175 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 176 * on from there, reading the first syscall arg as a string, i.e. open's 177 * filename. 178 */ 179SEC("tp/syscalls/sys_enter_connect") 180int sys_enter_connect(struct syscall_enter_args *args) 181{ 182 struct augmented_args_payload *augmented_args = augmented_args_payload(); 183 const void *sockaddr_arg = (const void *)args->args[1]; 184 unsigned int socklen = args->args[2]; 185 unsigned int len = sizeof(augmented_args->args); 186 187 if (augmented_args == NULL) 188 return 1; /* Failure: don't filter */ 189 190 _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); 191 socklen &= sizeof(augmented_args->saddr) - 1; 192 193 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 194 195 return augmented__output(args, augmented_args, len + socklen); 196} 197 198SEC("tp/syscalls/sys_enter_sendto") 199int sys_enter_sendto(struct syscall_enter_args *args) 200{ 201 struct augmented_args_payload *augmented_args = augmented_args_payload(); 202 const void *sockaddr_arg = (const void *)args->args[4]; 203 unsigned int socklen = args->args[5]; 204 unsigned int len = sizeof(augmented_args->args); 205 206 if (augmented_args == NULL) 207 return 1; /* Failure: don't filter */ 208 209 socklen &= sizeof(augmented_args->saddr) - 1; 210 211 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 212 213 return augmented__output(args, augmented_args, len + socklen); 214} 215 216SEC("tp/syscalls/sys_enter_open") 217int sys_enter_open(struct syscall_enter_args *args) 218{ 219 struct augmented_args_payload *augmented_args = augmented_args_payload(); 220 const void *filename_arg = (const void *)args->args[0]; 221 unsigned int len = sizeof(augmented_args->args); 222 223 if (augmented_args == NULL) 224 return 1; /* Failure: don't filter */ 225 226 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 227 228 return augmented__output(args, augmented_args, len); 229} 230 231SEC("tp/syscalls/sys_enter_openat") 232int sys_enter_openat(struct syscall_enter_args *args) 233{ 234 struct augmented_args_payload *augmented_args = augmented_args_payload(); 235 const void *filename_arg = (const void *)args->args[1]; 236 unsigned int len = sizeof(augmented_args->args); 237 238 if (augmented_args == NULL) 239 return 1; /* Failure: don't filter */ 240 241 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 242 243 return augmented__output(args, augmented_args, len); 244} 245 246SEC("tp/syscalls/sys_enter_rename") 247int sys_enter_rename(struct syscall_enter_args *args) 248{ 249 struct augmented_args_payload *augmented_args = augmented_args_payload(); 250 const void *oldpath_arg = (const void *)args->args[0], 251 *newpath_arg = (const void *)args->args[1]; 252 unsigned int len = sizeof(augmented_args->args), oldpath_len; 253 254 if (augmented_args == NULL) 255 return 1; /* Failure: don't filter */ 256 257 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 258 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 259 260 return augmented__output(args, augmented_args, len); 261} 262 263SEC("tp/syscalls/sys_enter_renameat") 264int sys_enter_renameat(struct syscall_enter_args *args) 265{ 266 struct augmented_args_payload *augmented_args = augmented_args_payload(); 267 const void *oldpath_arg = (const void *)args->args[1], 268 *newpath_arg = (const void *)args->args[3]; 269 unsigned int len = sizeof(augmented_args->args), oldpath_len; 270 271 if (augmented_args == NULL) 272 return 1; /* Failure: don't filter */ 273 274 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 275 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 276 277 return augmented__output(args, augmented_args, len); 278} 279 280#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 281 282// we need just the start, get the size to then copy it 283struct perf_event_attr_size { 284 __u32 type; 285 /* 286 * Size of the attr structure, for fwd/bwd compat. 287 */ 288 __u32 size; 289}; 290 291SEC("tp/syscalls/sys_enter_perf_event_open") 292int sys_enter_perf_event_open(struct syscall_enter_args *args) 293{ 294 struct augmented_args_payload *augmented_args = augmented_args_payload(); 295 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 296 unsigned int len = sizeof(augmented_args->args); 297 298 if (augmented_args == NULL) 299 goto failure; 300 301 if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0) 302 goto failure; 303 304 attr_read = (const struct perf_event_attr_size *)augmented_args->__data; 305 306 __u32 size = attr_read->size; 307 308 if (!size) 309 size = PERF_ATTR_SIZE_VER0; 310 311 if (size > sizeof(augmented_args->__data)) 312 goto failure; 313 314 // Now that we read attr->size and tested it against the size limits, read it completely 315 if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0) 316 goto failure; 317 318 return augmented__output(args, augmented_args, len + size); 319failure: 320 return 1; /* Failure: don't filter */ 321} 322 323SEC("tp/syscalls/sys_enter_clock_nanosleep") 324int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 325{ 326 struct augmented_args_payload *augmented_args = augmented_args_payload(); 327 const void *rqtp_arg = (const void *)args->args[2]; 328 unsigned int len = sizeof(augmented_args->args); 329 __u32 size = sizeof(struct timespec64); 330 331 if (augmented_args == NULL) 332 goto failure; 333 334 if (size > sizeof(augmented_args->__data)) 335 goto failure; 336 337 bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg); 338 339 return augmented__output(args, augmented_args, len + size); 340failure: 341 return 1; /* Failure: don't filter */ 342} 343 344static pid_t getpid(void) 345{ 346 return bpf_get_current_pid_tgid(); 347} 348 349static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 350{ 351 return bpf_map_lookup_elem(pids, &pid) != NULL; 352} 353 354SEC("tp/raw_syscalls/sys_enter") 355int sys_enter(struct syscall_enter_args *args) 356{ 357 struct augmented_args_payload *augmented_args; 358 /* 359 * We start len, the amount of data that will be in the perf ring 360 * buffer, if this is not filtered out by one of pid_filter__has(), 361 * syscall->enabled, etc, with the non-augmented raw syscall payload, 362 * i.e. sizeof(augmented_args->args). 363 * 364 * We'll add to this as we add augmented syscalls right after that 365 * initial, non-augmented raw_syscalls:sys_enter payload. 366 */ 367 368 if (pid_filter__has(&pids_filtered, getpid())) 369 return 0; 370 371 augmented_args = augmented_args_payload(); 372 if (augmented_args == NULL) 373 return 1; 374 375 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 376 377 /* 378 * Jump to syscall specific augmenter, even if the default one, 379 * "!raw_syscalls:unaugmented" that will just return 1 to return the 380 * unaugmented tracepoint payload. 381 */ 382 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 383 384 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 385 return 0; 386} 387 388SEC("tp/raw_syscalls/sys_exit") 389int sys_exit(struct syscall_exit_args *args) 390{ 391 struct syscall_exit_args exit_args; 392 393 if (pid_filter__has(&pids_filtered, getpid())) 394 return 0; 395 396 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 397 /* 398 * Jump to syscall specific return augmenter, even if the default one, 399 * "!raw_syscalls:unaugmented" that will just return 1 to return the 400 * unaugmented tracepoint payload. 401 */ 402 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 403 /* 404 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 405 */ 406 return 0; 407} 408 409char _license[] SEC("license") = "GPL"; 410