1// SPDX-License-Identifier: GPL-2.0
2// Copyright (c) 2019 Facebook
3
4#include <stdint.h>
5#include <stddef.h>
6#include <stdbool.h>
7#include <linux/bpf.h>
8#include <linux/ptrace.h>
9#include <linux/sched.h>
10#include <linux/types.h>
11#include <bpf/bpf_helpers.h>
12
/*
 * Local stand-ins: pid_t is a 32-bit unsigned integer here and task_struct
 * is an empty placeholder (its real layout is not available in this file).
 */
typedef uint32_t pid_t;
struct task_struct {};

/* size of the comm buffer stored in every sample */
#define TASK_COMM_LEN 16
/* number of 64-bit frames per entry of the stack trace maps */
#define PERF_MAX_STACK_DEPTH 127

/* value type tags (not referenced elsewhere in this file) */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* bit of the ktime value that selects between stacks_0 and stacks_1 */
#define STACK_TABLE_EPOCH_SHIFT 20
/* upper bound passed to every bpf_probe_read_user_str() call */
#define STROBE_MAX_STR_LEN 1
/* capacity of the strobemeta_cfgs map */
#define STROBE_MAX_CFGS 32
/*
 * Worst-case payload size: one string per str variable plus, per map, one
 * tag and a key/value pair for every entry. STROBE_MAX_STRS,
 * STROBE_MAX_MAPS and STROBE_MAX_MAP_ENTRIES are not defined in this file;
 * presumably the including source defines them first - TODO confirm.
 */
#define STROBE_MAX_PAYLOAD						\
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
	STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
30
/* Common 8-byte prefix shared by every strobe_value_* variant. */
struct strobe_value_header {
	/*
	 * Interpretation depends on the value type:
	 * 1. int: 0 when the value is not set, 1 otherwise;
	 * 2. str: always 1; whether a value is set is determined by ptr;
	 * 3. map: always 1; the pointer refers to an additional struct
	 *    holding the number of entries (up to STROBE_MAX_MAP_ENTRIES).
	 */
	uint16_t len;
	/*
	 * Reserved for future fields/flags. The header has to stay 8 bytes
	 * long so that BPF can fetch header and value together with a single
	 * 16-byte read.
	 */
	uint8_t _reserved[6];
};
47
48/*
49 * strobe_value_generic is used from BPF probe only, but needs to be a union
50 * of strobe_value_int/strobe_value_str/strobe_value_map
51 */
52struct strobe_value_generic {
53	struct strobe_value_header header;
54	union {
55		int64_t val;
56		void *ptr;
57	};
58};
59
60struct strobe_value_int {
61	struct strobe_value_header header;
62	int64_t value;
63};
64
65struct strobe_value_str {
66	struct strobe_value_header header;
67	const char* value;
68};
69
70struct strobe_value_map {
71	struct strobe_value_header header;
72	const struct strobe_map_raw* value;
73};
74
/* One key/value pair of a strobe map; both members point to C strings. */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
79
80/*
81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
82 * corresponding int64 ID, which application can use (or ignore) in whatever
83 * way appropriate. Map is "write-only", there is no way to get data out of
84 * map. Map is intended to be used to provide metadata for profilers and is
85 * not to be used for internal in-app communication. All methods are
86 * thread-safe.
87 */
88struct strobe_map_raw {
89	/*
90	 * general purpose unique ID that's up to application to decide
91	 * whether and how to use; for request metadata use case id is unique
92	 * request ID that's used to match metadata with stack traces on
93	 * Strobelight backend side
94	 */
95	int64_t id;
96	/* number of used entries in map */
97	int64_t cnt;
98	/*
99	 * having volatile doesn't change anything on BPF side, but clang
100	 * emits warnings for passing `volatile const char *` into
101	 * bpf_probe_read_user_str that expects just `const char *`
102	 */
103	const char* tag;
104	/*
105	 * key/value entries, each consisting of 2 pointers to key and value
106	 * C strings
107	 */
108	struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
109};
110
/*
 * Supported values of the TLS mode (see strobe_value_loc.tls_mode).
 * The negative constant is parenthesized so that it always expands as a
 * single operand, regardless of the surrounding expression.
 */
#define TLS_NOT_SET (-1)
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
116
/*
 * Universal description of a TLS location, covering both static executables
 * and shared libraries.
 */
struct strobe_value_loc {
	/*
	 * TLS mode that was used for a particular metavariable:
	 * - -1 (TLS_NOT_SET) - no metavariable;
	 * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
	 * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
	 * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
	 * Local Dynamic mode is not supported yet, because it has never been
	 * seen in practice. The mode determines how the offset field is
	 * interpreted; see calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC: offset from the thread pointer (fs:0 for x86-64,
	 * tpidr_el0 for aarch64);
	 * TLS_IMM_EXEC: absolute address of the GOT entry containing the
	 * offset from the thread pointer;
	 * TLS_GENERAL_DYN: absolute address of the double GOT entry
	 * containing a tls_index_t struct.
	 */
	int64_t offset;
};
143
144struct strobemeta_cfg {
145	int64_t req_meta_idx;
146	struct strobe_value_loc int_locs[STROBE_MAX_INTS];
147	struct strobe_value_loc str_locs[STROBE_MAX_STRS];
148	struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
149};
150
151struct strobe_map_descr {
152	uint64_t id;
153	int16_t tag_len;
154	/*
155	 * cnt <0 - map value isn't set;
156	 * 0 - map has id set, but no key/value entries
157	 */
158	int16_t cnt;
159	/*
160	 * both key_lens[i] and val_lens[i] should be >0 for present key/value
161	 * entry
162	 */
163	uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
164	uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
165};
166
167struct strobemeta_payload {
168	/* req_id has valid request ID, if req_meta_valid == 1 */
169	int64_t req_id;
170	uint8_t req_meta_valid;
171	/*
172	 * mask has Nth bit set to 1, if Nth metavar was present and
173	 * successfully read
174	 */
175	uint64_t int_vals_set_mask;
176	int64_t int_vals[STROBE_MAX_INTS];
177	/* len is >0 for present values */
178	uint16_t str_lens[STROBE_MAX_STRS];
179	/* if map_descrs[i].cnt == -1, metavar is not present/set */
180	struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
181	/*
182	 * payload has compactly packed values of str and map variables in the
183	 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
184	 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
185	 * value length
186	 */
187	char payload[STROBE_MAX_PAYLOAD];
188};
189
190struct strobelight_bpf_sample {
191	uint64_t ktime;
192	char comm[TASK_COMM_LEN];
193	pid_t pid;
194	int user_stack_id;
195	int kernel_stack_id;
196	int has_meta;
197	struct strobemeta_payload metadata;
198	/*
199	 * makes it possible to pass (<real payload size> + 1) as data size to
200	 * perf_submit() to avoid perf_submit's paranoia about passing zero as
201	 * size, as it deduces that <real payload size> might be
202	 * **theoretically** zero
203	 */
204	char dummy_safeguard;
205};
206
207struct {
208	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
209	__uint(max_entries, 32);
210	__uint(key_size, sizeof(int));
211	__uint(value_size, sizeof(int));
212} samples SEC(".maps");
213
214struct {
215	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
216	__uint(max_entries, 16);
217	__uint(key_size, sizeof(uint32_t));
218	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
219} stacks_0 SEC(".maps");
220
221struct {
222	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
223	__uint(max_entries, 16);
224	__uint(key_size, sizeof(uint32_t));
225	__uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
226} stacks_1 SEC(".maps");
227
228struct {
229	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
230	__uint(max_entries, 1);
231	__type(key, uint32_t);
232	__type(value, struct strobelight_bpf_sample);
233} sample_heap SEC(".maps");
234
235struct {
236	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
237	__uint(max_entries, STROBE_MAX_CFGS);
238	__type(key, pid_t);
239	__type(value, struct strobemeta_cfg);
240} strobemeta_cfgs SEC(".maps");
241
/*
 * Element type of the dtv array.
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;
251
252/* Partial definition for tcbhead_t */
253/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
254struct tcbhead {
255	void* tcb;
256	dtv_t* dtv;
257};
258
/*
 * TLS module/offset pair used in the shared library case.
 * On x86-64 it is mapped onto two GOT entries.
 * On aarch64 it is pointed to by the second GOT entry.
 */
struct tls_index {
	uint64_t module;	/* index into the dtv array */
	uint64_t offset;	/* offset added to the pointer read from dtv */
};
268
269#ifdef SUBPROGS
270__noinline
271#else
272__always_inline
273#endif
274static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
275{
276	/*
277	 * tls_mode value is:
278	 * - -1 (TLS_NOT_SET), if no metavar is present;
279	 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
280	 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
281	 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
282	 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
283	 * This schema allows to use something like:
284	 * (tls_mode + 1) * (tls_base + offset)
285	 * to get NULL for "no metavar" location, or correct pointer for local
286	 * executable mode without doing extra ifs.
287	 */
288	if (loc->tls_mode <= TLS_LOCAL_EXEC) {
289		/* static executable is simple, we just have offset from
290		 * tls_base */
291		void *addr = tls_base + loc->offset;
292		/* multiply by (tls_mode + 1) to get NULL, if we have no
293		 * metavar in this slot */
294		return (void *)((loc->tls_mode + 1) * (int64_t)addr);
295	}
296	/*
297	 * Other modes are more complicated, we need to jump through few hoops.
298	 *
299	 * For immediate executable mode (currently supported only for aarch64):
300	 *  - loc->offset is pointing to a GOT entry containing fixed offset
301	 *  relative to tls_base;
302	 *
303	 * For general dynamic mode:
304	 *  - loc->offset is pointing to a beginning of double GOT entries;
305	 *  - (for aarch64 only) second entry points to tls_index_t struct;
306	 *  - (for x86-64 only) two GOT entries are already tls_index_t;
307	 *  - tls_index_t->module is used to find start of TLS section in
308	 *  which variable resides;
309	 *  - tls_index_t->offset provides offset within that TLS section,
310	 *  pointing to value of variable.
311	 */
312	struct tls_index tls_index;
313	dtv_t *dtv;
314	void *tls_ptr;
315
316	bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
317			    (void *)loc->offset);
318	/* valid module index is always positive */
319	if (tls_index.module > 0) {
320		/* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
321		bpf_probe_read_user(&dtv, sizeof(dtv),
322				    &((struct tcbhead *)tls_base)->dtv);
323		dtv += tls_index.module;
324	} else {
325		dtv = NULL;
326	}
327	bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
328	/* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
329	return tls_ptr && tls_ptr != (void *)-1
330		? tls_ptr + tls_index.offset
331		: NULL;
332}
333
334#ifdef SUBPROGS
335__noinline
336#else
337__always_inline
338#endif
339static void read_int_var(struct strobemeta_cfg *cfg,
340			 size_t idx, void *tls_base,
341			 struct strobe_value_generic *value,
342			 struct strobemeta_payload *data)
343{
344	void *location = calc_location(&cfg->int_locs[idx], tls_base);
345	if (!location)
346		return;
347
348	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
349	data->int_vals[idx] = value->val;
350	if (value->header.len)
351		data->int_vals_set_mask |= (1 << idx);
352}
353
354static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
355					     size_t idx, void *tls_base,
356					     struct strobe_value_generic *value,
357					     struct strobemeta_payload *data,
358					     void *payload)
359{
360	void *location;
361	uint64_t len;
362
363	data->str_lens[idx] = 0;
364	location = calc_location(&cfg->str_locs[idx], tls_base);
365	if (!location)
366		return 0;
367
368	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
369	len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
370	/*
371	 * if bpf_probe_read_user_str returns error (<0), due to casting to
372	 * unsinged int, it will become big number, so next check is
373	 * sufficient to check for errors AND prove to BPF verifier, that
374	 * bpf_probe_read_user_str won't return anything bigger than
375	 * STROBE_MAX_STR_LEN
376	 */
377	if (len > STROBE_MAX_STR_LEN)
378		return 0;
379
380	data->str_lens[idx] = len;
381	return len;
382}
383
384static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
385					  size_t idx, void *tls_base,
386					  struct strobe_value_generic *value,
387					  struct strobemeta_payload *data,
388					  void *payload)
389{
390	struct strobe_map_descr* descr = &data->map_descrs[idx];
391	struct strobe_map_raw map;
392	void *location;
393	uint64_t len;
394	int i;
395
396	descr->tag_len = 0; /* presume no tag is set */
397	descr->cnt = -1; /* presume no value is set */
398
399	location = calc_location(&cfg->map_locs[idx], tls_base);
400	if (!location)
401		return payload;
402
403	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
404	if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
405		return payload;
406
407	descr->id = map.id;
408	descr->cnt = map.cnt;
409	if (cfg->req_meta_idx == idx) {
410		data->req_id = map.id;
411		data->req_meta_valid = 1;
412	}
413
414	len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
415	if (len <= STROBE_MAX_STR_LEN) {
416		descr->tag_len = len;
417		payload += len;
418	}
419
420#ifdef NO_UNROLL
421#pragma clang loop unroll(disable)
422#else
423#pragma unroll
424#endif
425	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
426		if (i >= map.cnt)
427			break;
428
429		descr->key_lens[i] = 0;
430		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
431					      map.entries[i].key);
432		if (len <= STROBE_MAX_STR_LEN) {
433			descr->key_lens[i] = len;
434			payload += len;
435		}
436		descr->val_lens[i] = 0;
437		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
438					      map.entries[i].val);
439		if (len <= STROBE_MAX_STR_LEN) {
440			descr->val_lens[i] = len;
441			payload += len;
442		}
443	}
444
445	return payload;
446}
447
448#ifdef USE_BPF_LOOP
/* Selects which reader read_var_callback() dispatches to. */
enum read_type {
	READ_INT_VAR = 0,
	READ_MAP_VAR = 1,
	READ_STR_VAR = 2,
};
454
455struct read_var_ctx {
456	struct strobemeta_payload *data;
457	void *tls_base;
458	struct strobemeta_cfg *cfg;
459	void *payload;
460	/* value gets mutated */
461	struct strobe_value_generic *value;
462	enum read_type type;
463};
464
465static int read_var_callback(__u32 index, struct read_var_ctx *ctx)
466{
467	switch (ctx->type) {
468	case READ_INT_VAR:
469		if (index >= STROBE_MAX_INTS)
470			return 1;
471		read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data);
472		break;
473	case READ_MAP_VAR:
474		if (index >= STROBE_MAX_MAPS)
475			return 1;
476		ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base,
477					    ctx->value, ctx->data, ctx->payload);
478		break;
479	case READ_STR_VAR:
480		if (index >= STROBE_MAX_STRS)
481			return 1;
482		ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base,
483					     ctx->value, ctx->data, ctx->payload);
484		break;
485	}
486	return 0;
487}
488#endif /* USE_BPF_LOOP */
489
490/*
491 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
492 * pointer to *right after* payload ends
493 */
494#ifdef SUBPROGS
495__noinline
496#else
497__always_inline
498#endif
499static void *read_strobe_meta(struct task_struct *task,
500			      struct strobemeta_payload *data)
501{
502	pid_t pid = bpf_get_current_pid_tgid() >> 32;
503	struct strobe_value_generic value = {0};
504	struct strobemeta_cfg *cfg;
505	void *tls_base, *payload;
506
507	cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
508	if (!cfg)
509		return NULL;
510
511	data->int_vals_set_mask = 0;
512	data->req_meta_valid = 0;
513	payload = data->payload;
514	/*
515	 * we don't have struct task_struct definition, it should be:
516	 * tls_base = (void *)task->thread.fsbase;
517	 */
518	tls_base = (void *)task;
519
520#ifdef USE_BPF_LOOP
521	struct read_var_ctx ctx = {
522		.cfg = cfg,
523		.tls_base = tls_base,
524		.value = &value,
525		.data = data,
526		.payload = payload,
527	};
528	int err;
529
530	ctx.type = READ_INT_VAR;
531	err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0);
532	if (err != STROBE_MAX_INTS)
533		return NULL;
534
535	ctx.type = READ_STR_VAR;
536	err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0);
537	if (err != STROBE_MAX_STRS)
538		return NULL;
539
540	ctx.type = READ_MAP_VAR;
541	err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0);
542	if (err != STROBE_MAX_MAPS)
543		return NULL;
544#else
545#ifdef NO_UNROLL
546#pragma clang loop unroll(disable)
547#else
548#pragma unroll
549#endif /* NO_UNROLL */
550	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
551		read_int_var(cfg, i, tls_base, &value, data);
552	}
553#ifdef NO_UNROLL
554#pragma clang loop unroll(disable)
555#else
556#pragma unroll
557#endif /* NO_UNROLL */
558	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
559		payload += read_str_var(cfg, i, tls_base, &value, data, payload);
560	}
561#ifdef NO_UNROLL
562#pragma clang loop unroll(disable)
563#else
564#pragma unroll
565#endif /* NO_UNROLL */
566	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
567		payload = read_map_var(cfg, i, tls_base, &value, data, payload);
568	}
569#endif /* USE_BPF_LOOP */
570
571	/*
572	 * return pointer right after end of payload, so it's possible to
573	 * calculate exact amount of useful data that needs to be sent
574	 */
575	return payload;
576}
577
578SEC("raw_tracepoint/kfree_skb")
579int on_event(struct pt_regs *ctx) {
580	pid_t pid =  bpf_get_current_pid_tgid() >> 32;
581	struct strobelight_bpf_sample* sample;
582	struct task_struct *task;
583	uint32_t zero = 0;
584	uint64_t ktime_ns;
585	void *sample_end;
586
587	sample = bpf_map_lookup_elem(&sample_heap, &zero);
588	if (!sample)
589		return 0; /* this will never happen */
590
591	sample->pid = pid;
592	bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
593	ktime_ns = bpf_ktime_get_ns();
594	sample->ktime = ktime_ns;
595
596	task = (struct task_struct *)bpf_get_current_task();
597	sample_end = read_strobe_meta(task, &sample->metadata);
598	sample->has_meta = sample_end != NULL;
599	sample_end = sample_end ? : &sample->metadata;
600
601	if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
602		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
603		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
604	} else {
605		sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
606		sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
607	}
608
609	uint64_t sample_size = sample_end - (void *)sample;
610	/* should always be true */
611	if (sample_size < sizeof(struct strobelight_bpf_sample))
612		bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
613	return 0;
614}
615
616char _license[] SEC("license") = "GPL";
617