1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2017, 2018, 2019, 2021 BMW Car IT GmbH
4 * Author: Viktor Rosendahl (viktor.rosendahl@bmw.de)
5 */
6
7#define _GNU_SOURCE
8#define _POSIX_C_SOURCE 200809L
9
10#include <ctype.h>
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include <err.h>
17#include <errno.h>
18#include <fcntl.h>
19#include <getopt.h>
20#include <sched.h>
21#include <linux/unistd.h>
22#include <signal.h>
23#include <sys/inotify.h>
24#include <unistd.h>
25#include <pthread.h>
26#include <tracefs.h>
27
28static const char *prg_name;
29static const char *prg_unknown = "unknown program name";
30
31static int fd_stdout;
32
33static int sched_policy;
34static bool sched_policy_set;
35
36static int sched_pri;
37static bool sched_pri_set;
38
39static bool trace_enable = true;
40static bool setup_ftrace = true;
41static bool use_random_sleep;
42
43#define TRACE_OPTS				\
44	C(FUNC_TR, "function-trace"),		\
45	C(DISP_GR, "display-graph"),		\
46	C(NR,       NULL)
47
48#undef C
49#define C(a, b) OPTIDX_##a
50
51enum traceopt {
52	TRACE_OPTS
53};
54
55#undef C
56#define C(a, b)  b
57
58static const char *const optstr[] = {
59	TRACE_OPTS
60};
61
/* How read_file()/write_file() react to an I/O error. */
enum errhandling {
	ERR_EXIT = 0,	/* errx() immediately, no ftrace cleanup */
	ERR_WARN,	/* warn and carry on */
	ERR_CLEANUP,	/* warn, then terminate via cleanup_exit() */
};
67
68static bool use_options[OPTIDX_NR];
69
70static char inotify_buffer[655360];
71
72#define likely(x)      __builtin_expect(!!(x), 1)
73#define unlikely(x)    __builtin_expect(!!(x), 0)
74#define bool2str(x)    (x ? "true":"false")
75
76#define DEFAULT_NR_PRINTER_THREADS (3)
77static unsigned int nr_threads = DEFAULT_NR_PRINTER_THREADS;
78
79#define DEFAULT_TABLE_SIZE (2)
80static unsigned int table_startsize = DEFAULT_TABLE_SIZE;
81
82static int verbosity;
83
84#define verbose_sizechange() (verbosity >= 1)
85#define verbose_lostevent()  (verbosity >= 2)
86#define verbose_ftrace()     (verbosity >= 1)
87
88#define was_changed(ORIG, CUR) (strcmp(ORIG, CUR) != 0)
89#define needs_change(CUR, WANTED) (strcmp(CUR, WANTED) != 0)
90
91static const char *debug_tracefile;
92static const char *debug_tracefile_dflt;
93static const char *debug_maxlat;
94static const char *debug_maxlat_dflt;
95static const char * const DEBUG_NOFILE = "[file not found]";
96
97static const char * const TR_MAXLAT  = "tracing_max_latency";
98static const char * const TR_THRESH  = "tracing_thresh";
99static const char * const TR_CURRENT = "current_tracer";
100static const char * const TR_OPTIONS = "trace_options";
101
102static const char * const NOP_TRACER = "nop";
103
104static const char * const OPT_NO_PREFIX = "no";
105
106#define DFLT_THRESHOLD_US "0"
107static const char *threshold = DFLT_THRESHOLD_US;
108
109#define DEV_URANDOM     "/dev/urandom"
110#define RT_DEFAULT_PRI (99)
111#define DEFAULT_PRI    (0)
112
113#define USEC_PER_MSEC (1000L)
114#define NSEC_PER_USEC (1000L)
115#define NSEC_PER_MSEC (USEC_PER_MSEC * NSEC_PER_USEC)
116
117#define MSEC_PER_SEC (1000L)
118#define USEC_PER_SEC (USEC_PER_MSEC * MSEC_PER_SEC)
119#define NSEC_PER_SEC (NSEC_PER_MSEC * MSEC_PER_SEC)
120
121#define SLEEP_TIME_MS_DEFAULT (1000L)
122#define TRY_PRINTMUTEX_MS (1000)
123
124static long sleep_time = (USEC_PER_MSEC * SLEEP_TIME_MS_DEFAULT);
125
126static const char * const queue_full_warning =
127"Could not queue trace for printing. It is likely that events happen faster\n"
128"than what they can be printed. Probably partly because of random sleeping\n";
129
130static const char * const no_tracer_msg =
131"Could not find any tracers! Running this program as root may help!\n";
132
133static const char * const no_latency_tr_msg =
134"No latency tracers are supported by your kernel!\n";
135
/* Maps a command-line scheduling policy name to its SCHED_* constant. */
struct policy {
	const char *name;	/* user-visible name, e.g. "fifo" */
	int policy;		/* SCHED_* constant */
	int default_pri;	/* priority used when none is given */
};
141
142static const struct policy policies[] = {
143	{ "other", SCHED_OTHER, DEFAULT_PRI    },
144	{ "batch", SCHED_BATCH, DEFAULT_PRI    },
145	{ "idle",  SCHED_IDLE,  DEFAULT_PRI    },
146	{ "rr",    SCHED_RR,    RT_DEFAULT_PRI },
147	{ "fifo",  SCHED_FIFO,  RT_DEFAULT_PRI },
148	{ NULL,    0,           DEFAULT_PRI    }
149};
150
151/*
152 * The default tracer will be the first on this list that is supported by the
153 * currently running Linux kernel.
154 */
155static const char * const relevant_tracers[] = {
156	"preemptirqsoff",
157	"preemptoff",
158	"irqsoff",
159	"wakeup",
160	"wakeup_rt",
161	"wakeup_dl",
162	NULL
163};
164
165/* This is the list of tracers for which random sleep makes sense */
166static const char * const random_tracers[] = {
167	"preemptirqsoff",
168	"preemptoff",
169	"irqsoff",
170	NULL
171};
172
173static const char *current_tracer;
174static bool force_tracer;
175
/* Snapshot of the ftrace settings we modify, so they can be restored on exit. */
struct ftrace_state {
	char *tracer;			/* saved current_tracer contents */
	char *thresh;			/* saved tracing_thresh contents */
	bool opt[OPTIDX_NR];		/* saved trace option values */
	bool opt_valid[OPTIDX_NR];	/* whether opt[i] was found and saved */
	pthread_mutex_t mutex;
};
183
/* One print request: a detected latency identified by its ticket number. */
struct entry {
	int ticket;			/* sequence number of this latency */
	int ticket_completed_ref;	/* printstate.ticket_completed at creation */
};
188
/*
 * Shared state coordinating the printer threads: tickets order the
 * detected latencies, cnt counts printouts currently in flight.
 */
struct print_state {
	int ticket_counter;	/* last ticket handed out */
	int ticket_completed;	/* highest ticket already printed */
	pthread_mutex_t mutex;	/* protects the ticket fields */
	pthread_cond_t cond;	/* broadcast when a new ticket is issued */
	int cnt;		/* nr of queued/ongoing printouts */
	pthread_mutex_t cnt_mutex;	/* protects cnt */
};
197
/* Small preformatted message, e.g. about a probability table resize. */
struct short_msg {
	char buf[160];
	int len;	/* nr of valid bytes in buf (not NUL terminated use) */
};
202
203static struct print_state printstate;
204static struct ftrace_state save_state;
205volatile sig_atomic_t signal_flag;
206
207#define PROB_TABLE_MAX_SIZE (1000)
208
209int probabilities[PROB_TABLE_MAX_SIZE];
210
/* The tail of probabilities[] that is currently in use for random sleeps. */
struct sleep_table {
	int *table;	/* points into probabilities[] */
	int size;	/* nr of valid entries in table */
	pthread_mutex_t mutex;
};
216
217static struct sleep_table sleeptable;
218
219#define QUEUE_SIZE (10)
220
/*
 * Fixed-size ring buffer of print requests. One slot is deliberately kept
 * unused (see queue_nr_free()) so full and empty can be told apart.
 */
struct queue {
	struct entry entries[QUEUE_SIZE];
	int next_prod_idx;	/* slot where the next entry is inserted */
	int next_cons_idx;	/* next entry to be removed */
	pthread_mutex_t mutex;
	pthread_cond_t cond;	/* signaled when an entry is added */
};
228
229#define MAX_THREADS (40)
230
231struct queue printqueue;
232pthread_t printthread[MAX_THREADS];
233pthread_mutex_t print_mtx;
234#define PRINT_BUFFER_SIZE (16 * 1024 * 1024)
235
236static void cleanup_exit(int status);
237static int set_trace_opt(const char *opt, bool value);
238
/*
 * Allocate @size bytes; on failure, warn and terminate through
 * cleanup_exit() so the original ftrace configuration is restored.
 */
static __always_inline void *malloc_or_die(size_t size)
{
	void *mem = malloc(size);

	if (unlikely(!mem)) {
		warn("malloc() failed");
		cleanup_exit(EXIT_FAILURE);
	}
	return mem;
}
249
/*
 * Allocate @size bytes; on failure, exit immediately without restoring
 * the ftrace state. Used on paths where cleanup_exit() must be avoided.
 */
static __always_inline void *malloc_or_die_nocleanup(size_t size)
{
	void *ptr = malloc(size);

	if (unlikely(ptr == NULL))
		/* err(0, ...) would exit with status 0, i.e. success */
		err(EXIT_FAILURE, "malloc() failed");
	return ptr;
}
258
/*
 * Write all @count bytes of @buf to @fd, retrying after EINTR and after
 * short writes; any other error terminates via cleanup_exit().
 */
static __always_inline void write_or_die(int fd, const char *buf, size_t count)
{
	do {
		ssize_t written = write(fd, buf, count);

		if (unlikely(written < 0)) {
			if (errno == EINTR)
				continue;
			warn("write() failed");
			cleanup_exit(EXIT_FAILURE);
		}
		count -= written;
		buf += written;
	} while (count > 0);
}
275
276static __always_inline void clock_gettime_or_die(clockid_t clk_id,
277						 struct timespec *tp)
278{
279	int r = clock_gettime(clk_id, tp);
280
281	if (unlikely(r != 0))
282		err(EXIT_FAILURE, "clock_gettime() failed");
283}
284
/* sigemptyset() wrapper that terminates through cleanup_exit() on failure. */
static __always_inline void sigemptyset_or_die(sigset_t *s)
{
	if (unlikely(sigemptyset(s) != 0)) {
		warn("sigemptyset() failed");
		cleanup_exit(EXIT_FAILURE);
	}
}
292
293static __always_inline void sigaddset_or_die(sigset_t *s, int signum)
294{
295	if (unlikely(sigaddset(s, signum) != 0)) {
296		warn("sigemptyset() failed");
297		cleanup_exit(EXIT_FAILURE);
298	}
299}
300
/* sigaction() wrapper that terminates through cleanup_exit() on failure. */
static __always_inline void sigaction_or_die(int signum,
					     const struct sigaction *act,
					     struct sigaction *oldact)
{
	if (unlikely(sigaction(signum, act, oldact) != 0)) {
		warn("sigaction() failed");
		cleanup_exit(EXIT_FAILURE);
	}
}
310
311static void open_stdout(void)
312{
313	if (setvbuf(stdout, NULL, _IONBF, 0) != 0)
314		err(EXIT_FAILURE, "setvbuf() failed");
315	fd_stdout = fileno(stdout);
316	if (fd_stdout < 0)
317		err(EXIT_FAILURE, "fileno() failed");
318}
319
320/*
321 * It's not worth it to call cleanup_exit() from mutex functions because
322 * cleanup_exit() uses mutexes.
323 */
/* Take @mtx or die; see the note above about not using cleanup_exit(). */
static __always_inline void mutex_lock(pthread_mutex_t *mtx)
{
	if (unlikely((errno = pthread_mutex_lock(mtx)) != 0))
		err(EXIT_FAILURE, "pthread_mutex_lock() failed");
}
330
331
/* Release @mtx or die. */
static __always_inline void mutex_unlock(pthread_mutex_t *mtx)
{
	if (unlikely((errno = pthread_mutex_unlock(mtx)) != 0))
		err(EXIT_FAILURE, "pthread_mutex_unlock() failed");
}
338
/* Wake one waiter on @cond or die. */
static __always_inline void cond_signal(pthread_cond_t *cond)
{
	if (unlikely((errno = pthread_cond_signal(cond)) != 0))
		err(EXIT_FAILURE, "pthread_cond_signal() failed");
}
345
/* Wait on @cond with @mutex held, or die. */
static __always_inline void cond_wait(pthread_cond_t *restrict cond,
				      pthread_mutex_t *restrict mutex)
{
	if (unlikely((errno = pthread_cond_wait(cond, mutex)) != 0))
		err(EXIT_FAILURE, "pthread_cond_wait() failed");
}
353
/* Wake all waiters on @cond or die. */
static __always_inline void cond_broadcast(pthread_cond_t *cond)
{
	if (unlikely((errno = pthread_cond_broadcast(cond)) != 0))
		err(EXIT_FAILURE, "pthread_cond_broadcast() failed");
}
360
/* pthread_mutex_init() wrapper; exits on failure. */
static __always_inline void
mutex_init(pthread_mutex_t *mutex,
	   const pthread_mutexattr_t *attr)
{
	if ((errno = pthread_mutex_init(mutex, attr)) != 0)
		err(EXIT_FAILURE, "pthread_mutex_init() failed");
}
369
/* pthread_mutexattr_init() wrapper; exits on failure. */
static __always_inline void mutexattr_init(pthread_mutexattr_t *attr)
{
	if ((errno = pthread_mutexattr_init(attr)) != 0)
		err(EXIT_FAILURE, "pthread_mutexattr_init() failed");
}
376
/* pthread_mutexattr_destroy() wrapper; exits on failure. */
static __always_inline void mutexattr_destroy(pthread_mutexattr_t *attr)
{
	if ((errno = pthread_mutexattr_destroy(attr)) != 0)
		err(EXIT_FAILURE, "pthread_mutexattr_destroy() failed");
}
383
/* pthread_mutexattr_settype() wrapper; exits on failure. */
static __always_inline void mutexattr_settype(pthread_mutexattr_t *attr,
					      int type)
{
	if ((errno = pthread_mutexattr_settype(attr, type)) != 0)
		err(EXIT_FAILURE, "pthread_mutexattr_settype() failed");
}
391
/* pthread_condattr_init() wrapper; exits on failure. */
static __always_inline void condattr_init(pthread_condattr_t *attr)
{
	if ((errno = pthread_condattr_init(attr)) != 0)
		err(EXIT_FAILURE, "pthread_condattr_init() failed");
}
398
/* pthread_condattr_destroy() wrapper; exits on failure. */
static __always_inline void condattr_destroy(pthread_condattr_t *attr)
{
	if ((errno = pthread_condattr_destroy(attr)) != 0)
		err(EXIT_FAILURE, "pthread_condattr_destroy() failed");
}
405
406static __always_inline void condattr_setclock(pthread_condattr_t *attr,
407					      clockid_t clock_id)
408{
409	errno = pthread_condattr_setclock(attr, clock_id);
410	if (unlikely(errno))
411		err(EXIT_FAILURE, "pthread_condattr_setclock() failed");
412}
413
/* pthread_cond_init() wrapper; exits on failure. */
static __always_inline void cond_init(pthread_cond_t *cond,
				      const pthread_condattr_t *attr)
{
	if ((errno = pthread_cond_init(cond, attr)) != 0)
		err(EXIT_FAILURE, "pthread_cond_init() failed");
}
421
/*
 * pthread_cond_timedwait() wrapper: a timeout is a normal outcome and is
 * returned to the caller (also left in errno); anything else is fatal.
 */
static __always_inline int
cond_timedwait(pthread_cond_t *restrict cond,
	       pthread_mutex_t *restrict mutex,
	       const struct timespec *restrict abstime)
{
	int ret = pthread_cond_timedwait(cond, mutex, abstime);

	errno = ret;
	if (ret != 0 && ret != ETIMEDOUT)
		err(EXIT_FAILURE, "pthread_cond_timedwait() failed");
	return ret;
}
432
/* Reset the ticket counters and create the printstate lock/condition. */
static void init_printstate(void)
{
	pthread_condattr_t cattr;

	printstate.ticket_counter = 0;
	printstate.ticket_completed = 0;
	printstate.cnt = 0;

	mutex_init(&printstate.mutex, NULL);

	/* CLOCK_MONOTONIC so cond_timedwait() deadlines survive clock jumps */
	condattr_init(&cattr);
	condattr_setclock(&cattr, CLOCK_MONOTONIC);
	cond_init(&printstate.cond, &cattr);
	condattr_destroy(&cattr);
}
448
/*
 * Create print_mtx as a recursive mutex.
 * NOTE(review): recursive type presumably because exit paths may take it
 * while already held — confirm against cleanup_exit() callers.
 */
static void init_print_mtx(void)
{
	pthread_mutexattr_t mattr;

	mutexattr_init(&mattr);
	mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
	mutex_init(&print_mtx, &mattr);
	mutexattr_destroy(&mattr);

}
459
460static void signal_blocking(int how)
461{
462	sigset_t s;
463
464	sigemptyset_or_die(&s);
465	sigaddset_or_die(&s, SIGHUP);
466	sigaddset_or_die(&s, SIGTERM);
467	sigaddset_or_die(&s, SIGINT);
468
469	errno = pthread_sigmask(how, &s, NULL);
470	if (unlikely(errno)) {
471		warn("pthread_sigmask() failed");
472		cleanup_exit(EXIT_FAILURE);
473	}
474}
475
/*
 * Async-signal-safe handler: only record which signal arrived; the
 * worker loops poll signal_flag via check_signals() and react outside
 * of signal context.
 */
static void signal_handler(int num)
{
	signal_flag = num;
}
480
481static void setup_sig_handler(void)
482{
483	struct sigaction sa;
484
485	memset(&sa, 0, sizeof(sa));
486	sa.sa_handler = signal_handler;
487
488	sigaction_or_die(SIGHUP, &sa, NULL);
489	sigaction_or_die(SIGTERM, &sa, NULL);
490	sigaction_or_die(SIGINT, &sa, NULL);
491}
492
/* Report which signal stopped us, then exit cleanly (restoring ftrace). */
static void process_signal(int signal)
{
	const char *name = strsignal(signal);

	if (name != NULL)
		printf("Received signal %d (%s)\n", signal, name);
	else
		printf("Received signal %d\n", signal);
	cleanup_exit(EXIT_SUCCESS);
}
504
/* Poll the flag set by signal_handler() and act on any pending signal. */
static __always_inline void check_signals(void)
{
	int signal = signal_flag;

	if (unlikely(signal))
		process_signal(signal);
}
512
513static __always_inline void get_time_in_future(struct timespec *future,
514					       long time_us)
515{
516	long nsec;
517
518	clock_gettime_or_die(CLOCK_MONOTONIC, future);
519	future->tv_sec += time_us / USEC_PER_SEC;
520	nsec = future->tv_nsec + (time_us * NSEC_PER_USEC) % NSEC_PER_SEC;
521	if (nsec >= NSEC_PER_SEC) {
522		future->tv_nsec = nsec % NSEC_PER_SEC;
523		future->tv_sec += 1;
524	}
525}
526
527static __always_inline bool time_has_passed(const struct timespec *time)
528{
529	struct timespec now;
530
531	clock_gettime_or_die(CLOCK_MONOTONIC, &now);
532	if (now.tv_sec > time->tv_sec)
533		return true;
534	if (now.tv_sec < time->tv_sec)
535		return false;
536	return (now.tv_nsec >= time->tv_nsec);
537}
538
539static bool mutex_trylock_limit(pthread_mutex_t *mutex, int time_ms)
540{
541	long time_us = time_ms * USEC_PER_MSEC;
542	struct timespec limit;
543
544	get_time_in_future(&limit, time_us);
545	do {
546		errno =  pthread_mutex_trylock(mutex);
547		if (errno && errno != EBUSY)
548			err(EXIT_FAILURE, "pthread_mutex_trylock() failed");
549	} while (errno && !time_has_passed(&limit));
550	return errno == 0;
551}
552
553static void restore_trace_opts(const struct ftrace_state *state,
554				const bool *cur)
555{
556	int i;
557	int r;
558
559	for (i = 0; i < OPTIDX_NR; i++)
560		if (state->opt_valid[i] && state->opt[i] != cur[i]) {
561			r = set_trace_opt(optstr[i], state->opt[i]);
562			if (r < 0)
563				warnx("Failed to restore the %s option to %s",
564				      optstr[i], bool2str(state->opt[i]));
565			else if (verbose_ftrace())
566				printf("Restored the %s option in %s to %s\n",
567				       optstr[i], TR_OPTIONS,
568				       bool2str(state->opt[i]));
569		}
570}
571
572static char *read_file(const char *file, enum errhandling h)
573{
574	int psize;
575	char *r;
576	static const char *emsg = "Failed to read the %s file";
577
578	r = tracefs_instance_file_read(NULL, file, &psize);
579	if (!r) {
580		if (h) {
581			warn(emsg, file);
582			if (h == ERR_CLEANUP)
583				cleanup_exit(EXIT_FAILURE);
584		} else
585			errx(EXIT_FAILURE, emsg, file);
586	}
587
588	if (r && r[psize - 1] == '\n')
589		r[psize - 1] = '\0';
590	return r;
591}
592
/*
 * Write the saved value of @file back if it differs from @cur, then
 * free and clear the saved copy.
 */
static void restore_file(const char *file, char **saved, const char *cur)
{
	if (*saved == NULL || !was_changed(*saved, cur))
		return;

	if (tracefs_instance_file_write(NULL, file, *saved) < 0)
		warnx("Failed to restore %s to %s!", file, *saved);
	else if (verbose_ftrace())
		printf("Restored %s to %s\n", file, *saved);
	free(*saved);
	*saved = NULL;
}
604
605static void restore_ftrace(void)
606{
607	mutex_lock(&save_state.mutex);
608
609	restore_file(TR_CURRENT, &save_state.tracer, current_tracer);
610	restore_file(TR_THRESH, &save_state.thresh, threshold);
611	restore_trace_opts(&save_state, use_options);
612
613	mutex_unlock(&save_state.mutex);
614}
615
/*
 * Terminate with @status. Unless ftrace was never touched, first report
 * the maximum latency that was detected and restore the original ftrace
 * configuration.
 */
static void cleanup_exit(int status)
{
	char *maxlat;

	/* Nothing to report or restore if we never set up ftrace */
	if (!setup_ftrace)
		exit(status);

	/*
	 * We try the print_mtx for 1 sec in order to avoid garbled
	 * output if possible, but if it cannot be obtained we proceed anyway.
	 */
	mutex_trylock_limit(&print_mtx, TRY_PRINTMUTEX_MS);

	maxlat = read_file(TR_MAXLAT, ERR_WARN);
	if (maxlat) {
		printf("The maximum detected latency was: %sus\n", maxlat);
		free(maxlat);
	}

	restore_ftrace();
	/*
	 * We do not need to unlock the print_mtx here because we will exit at
	 * the end of this function. Unlocking print_mtx causes problems if a
	 * print thread happens to be waiting for the mutex because we have
	 * just changed the ftrace settings to the original and thus the
	 * print thread would output incorrect data from ftrace.
	 */
	exit(status);
}
645
/*
 * Prepare the save_state lock and mark nothing as saved yet.
 * NOTE(review): the mutex is recursive, presumably so restore paths can
 * re-enter while it is already held — confirm against restore_ftrace().
 */
static void init_save_state(void)
{
	pthread_mutexattr_t mattr;

	mutexattr_init(&mattr);
	mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
	mutex_init(&save_state.mutex, &mattr);
	mutexattr_destroy(&mattr);

	save_state.tracer = NULL;
	save_state.thresh = NULL;
	save_state.opt_valid[OPTIDX_FUNC_TR] = false;
	save_state.opt_valid[OPTIDX_DISP_GR] = false;
}
660
661static int printstate_next_ticket(struct entry *req)
662{
663	int r;
664
665	r = ++(printstate.ticket_counter);
666	req->ticket = r;
667	req->ticket_completed_ref = printstate.ticket_completed;
668	cond_broadcast(&printstate.cond);
669	return r;
670}
671
672static __always_inline
673void printstate_mark_req_completed(const struct entry *req)
674{
675	if (req->ticket > printstate.ticket_completed)
676		printstate.ticket_completed = req->ticket;
677}
678
679static __always_inline
680bool printstate_has_new_req_arrived(const struct entry *req)
681{
682	return (printstate.ticket_counter != req->ticket);
683}
684
685static __always_inline int printstate_cnt_inc(void)
686{
687	int value;
688
689	mutex_lock(&printstate.cnt_mutex);
690	value = ++printstate.cnt;
691	mutex_unlock(&printstate.cnt_mutex);
692	return value;
693}
694
695static __always_inline int printstate_cnt_dec(void)
696{
697	int value;
698
699	mutex_lock(&printstate.cnt_mutex);
700	value = --printstate.cnt;
701	mutex_unlock(&printstate.cnt_mutex);
702	return value;
703}
704
705static __always_inline int printstate_cnt_read(void)
706{
707	int value;
708
709	mutex_lock(&printstate.cnt_mutex);
710	value = printstate.cnt;
711	mutex_unlock(&printstate.cnt_mutex);
712	return value;
713}
714
715static __always_inline
716bool prev_req_won_race(const struct entry *req)
717{
718	return (printstate.ticket_completed != req->ticket_completed_ref);
719}
720
721static void sleeptable_resize(int size, bool printout, struct short_msg *msg)
722{
723	int bytes;
724
725	if (printout) {
726		msg->len = 0;
727		if (unlikely(size > PROB_TABLE_MAX_SIZE))
728			bytes = snprintf(msg->buf, sizeof(msg->buf),
729"Cannot increase probability table to %d (maximum size reached)\n", size);
730		else
731			bytes = snprintf(msg->buf, sizeof(msg->buf),
732"Increasing probability table to %d\n", size);
733		if (bytes < 0)
734			warn("snprintf() failed");
735		else
736			msg->len = bytes;
737	}
738
739	if (unlikely(size < 0)) {
740		/* Should never happen */
741		warnx("Bad program state at %s:%d", __FILE__, __LINE__);
742		cleanup_exit(EXIT_FAILURE);
743		return;
744	}
745	sleeptable.size = size;
746	sleeptable.table = &probabilities[PROB_TABLE_MAX_SIZE - size];
747}
748
749static void init_probabilities(void)
750{
751	int i;
752	int j = 1000;
753
754	for (i = 0; i < PROB_TABLE_MAX_SIZE; i++) {
755		probabilities[i] = 1000 / j;
756		j--;
757	}
758	mutex_init(&sleeptable.mutex, NULL);
759}
760
761static int table_get_probability(const struct entry *req,
762				 struct short_msg *msg)
763{
764	int diff = req->ticket - req->ticket_completed_ref;
765	int rval = 0;
766
767	msg->len = 0;
768	diff--;
769	/* Should never happen...*/
770	if (unlikely(diff < 0)) {
771		warnx("Programmer assumption error at %s:%d\n", __FILE__,
772		      __LINE__);
773		cleanup_exit(EXIT_FAILURE);
774	}
775	mutex_lock(&sleeptable.mutex);
776	if (diff >= (sleeptable.size - 1)) {
777		rval = sleeptable.table[sleeptable.size - 1];
778		sleeptable_resize(sleeptable.size + 1, verbose_sizechange(),
779				  msg);
780	} else {
781		rval = sleeptable.table[diff];
782	}
783	mutex_unlock(&sleeptable.mutex);
784	return rval;
785}
786
787static void init_queue(struct queue *q)
788{
789	q->next_prod_idx = 0;
790	q->next_cons_idx = 0;
791	mutex_init(&q->mutex, NULL);
792	errno = pthread_cond_init(&q->cond, NULL);
793	if (errno)
794		err(EXIT_FAILURE, "pthread_cond_init() failed");
795}
796
797static __always_inline int queue_len(const struct queue *q)
798{
799	if (q->next_prod_idx >= q->next_cons_idx)
800		return q->next_prod_idx - q->next_cons_idx;
801	else
802		return QUEUE_SIZE - q->next_cons_idx + q->next_prod_idx;
803}
804
805static __always_inline int queue_nr_free(const struct queue *q)
806{
807	int nr_free = QUEUE_SIZE - queue_len(q);
808
809	/*
810	 * If there is only one slot left we will anyway lie and claim that the
811	 * queue is full because adding an element will make it appear empty
812	 */
813	if (nr_free == 1)
814		nr_free = 0;
815	return nr_free;
816}
817
818static __always_inline void queue_idx_inc(int *idx)
819{
820	*idx = (*idx + 1) % QUEUE_SIZE;
821}
822
/* Copy @e into the producer slot and advance the producer index. */
static __always_inline void queue_push_to_back(struct queue *q,
					      const struct entry *e)
{
	q->entries[q->next_prod_idx] = *e;
	queue_idx_inc(&q->next_prod_idx);
}
829
/* Remove and return the oldest entry; caller must know the queue is non-empty. */
static __always_inline struct entry queue_pop_from_front(struct queue *q)
{
	struct entry e = q->entries[q->next_cons_idx];

	queue_idx_inc(&q->next_cons_idx);
	return e;
}
837
/* Wake one thread waiting for entries on @q. */
static __always_inline void queue_cond_signal(struct queue *q)
{
	cond_signal(&q->cond);
}
842
/* Wait on @q's condition; the caller must hold q->mutex. */
static __always_inline void queue_cond_wait(struct queue *q)
{
	cond_wait(&q->cond, &q->mutex);
}
847
848static __always_inline int queue_try_to_add_entry(struct queue *q,
849						  const struct entry *e)
850{
851	int r = 0;
852
853	mutex_lock(&q->mutex);
854	if (queue_nr_free(q) > 0) {
855		queue_push_to_back(q, e);
856		cond_signal(&q->cond);
857	} else
858		r = -1;
859	mutex_unlock(&q->mutex);
860	return r;
861}
862
863static struct entry queue_wait_for_entry(struct queue *q)
864{
865	struct entry e;
866
867	mutex_lock(&q->mutex);
868	while (true) {
869		if (queue_len(&printqueue) > 0) {
870			e = queue_pop_from_front(q);
871			break;
872		}
873		queue_cond_wait(q);
874	}
875	mutex_unlock(&q->mutex);
876
877	return e;
878}
879
880static const struct policy *policy_from_name(const char *name)
881{
882	const struct policy *p = &policies[0];
883
884	while (p->name != NULL) {
885		if (!strcmp(name, p->name))
886			return p;
887		p++;
888	}
889	return NULL;
890}
891
892static const char *policy_name(int policy)
893{
894	const struct policy *p = &policies[0];
895	static const char *rval = "unknown";
896
897	while (p->name != NULL) {
898		if (p->policy == policy)
899			return p->name;
900		p++;
901	}
902	return rval;
903}
904
905static bool is_relevant_tracer(const char *name)
906{
907	unsigned int i;
908
909	for (i = 0; relevant_tracers[i]; i++)
910		if (!strcmp(name, relevant_tracers[i]))
911			return true;
912	return false;
913}
914
915static bool random_makes_sense(const char *name)
916{
917	unsigned int i;
918
919	for (i = 0; random_tracers[i]; i++)
920		if (!strcmp(name, random_tracers[i]))
921			return true;
922	return false;
923}
924
925static void show_available(void)
926{
927	char **tracers;
928	int found = 0;
929	int i;
930
931	tracers = tracefs_tracers(NULL);
932	for (i = 0; tracers && tracers[i]; i++) {
933		if (is_relevant_tracer(tracers[i]))
934			found++;
935	}
936
937	if (!tracers) {
938		warnx(no_tracer_msg);
939		return;
940	}
941
942	if (!found) {
943		warnx(no_latency_tr_msg);
944		tracefs_list_free(tracers);
945		return;
946	}
947
948	printf("The following latency tracers are available on your system:\n");
949	for (i = 0; tracers[i]; i++) {
950		if (is_relevant_tracer(tracers[i]))
951			printf("%s\n", tracers[i]);
952	}
953	tracefs_list_free(tracers);
954}
955
/*
 * Check whether @name is among the tracers offered by the kernel.
 * *@notracer is set if the tracer list could not be read at all.
 */
static bool tracer_valid(const char *name, bool *notracer)
{
	char **tracers;
	bool found = false;
	int i;

	*notracer = false;
	tracers = tracefs_tracers(NULL);
	if (!tracers) {
		*notracer = true;
		return false;
	}

	for (i = 0; tracers[i] && !found; i++)
		found = (strcmp(tracers[i], name) == 0);

	tracefs_list_free(tracers);
	return found;
}
976
977static const char *find_default_tracer(void)
978{
979	int i;
980	bool notracer;
981	bool valid;
982
983	for (i = 0; relevant_tracers[i]; i++) {
984		valid = tracer_valid(relevant_tracers[i], &notracer);
985		if (notracer)
986			errx(EXIT_FAILURE, no_tracer_msg);
987		if (valid)
988			return relevant_tracers[i];
989	}
990	return NULL;
991}
992
/* Return true with probability @prob / 1000, using the per-thread PRNG. */
static bool toss_coin(struct drand48_data *buffer, unsigned int prob)
{
	long r;

	if (unlikely(lrand48_r(buffer, &r))) {
		warnx("lrand48_r() failed");
		cleanup_exit(EXIT_FAILURE);
	}
	return (r % 1000L) < prob;
}
1007
1008
1009static long go_to_sleep(const struct entry *req)
1010{
1011	struct timespec future;
1012	long delay = sleep_time;
1013
1014	get_time_in_future(&future, delay);
1015
1016	mutex_lock(&printstate.mutex);
1017	while (!printstate_has_new_req_arrived(req)) {
1018		cond_timedwait(&printstate.cond, &printstate.mutex, &future);
1019		if (time_has_passed(&future))
1020			break;
1021	}
1022
1023	if (printstate_has_new_req_arrived(req))
1024		delay = -1;
1025	mutex_unlock(&printstate.mutex);
1026
1027	return delay;
1028}
1029
1030
1031static void set_priority(void)
1032{
1033	int r;
1034	pid_t pid;
1035	struct sched_param param;
1036
1037	memset(&param, 0, sizeof(param));
1038	param.sched_priority = sched_pri;
1039
1040	pid = getpid();
1041	r = sched_setscheduler(pid, sched_policy, &param);
1042
1043	if (r != 0)
1044		err(EXIT_FAILURE, "sched_setscheduler() failed");
1045}
1046
1047pid_t latency_collector_gettid(void)
1048{
1049	return (pid_t) syscall(__NR_gettid);
1050}
1051
1052static void print_priority(void)
1053{
1054	pid_t tid;
1055	int policy;
1056	int r;
1057	struct sched_param param;
1058
1059	tid = latency_collector_gettid();
1060	r = pthread_getschedparam(pthread_self(), &policy, &param);
1061	if (r != 0) {
1062		warn("pthread_getschedparam() failed");
1063		cleanup_exit(EXIT_FAILURE);
1064	}
1065	mutex_lock(&print_mtx);
1066	printf("Thread %d runs with scheduling policy %s and priority %d\n",
1067	       tid, policy_name(policy), param.sched_priority);
1068	mutex_unlock(&print_mtx);
1069}
1070
/*
 * Format and atomically print a one-line message about latency @req:
 * either "detected" or "printout skipped due to <str>" (when @excuse is
 * set), optionally preceded by a pending table-resize message.
 */
static __always_inline
void __print_skipmessage(const struct short_msg *resize_msg,
			 const struct timespec *timestamp, char *buffer,
			 size_t bufspace, const struct entry *req, bool excuse,
			 const char *str)
{
	ssize_t bytes = 0;
	char *p = &buffer[0];
	long us, sec;
	int r;

	sec = timestamp->tv_sec;
	us = timestamp->tv_nsec / 1000;

	/* Prepend the resize message, if any; its length is kept in ->len */
	if (resize_msg != NULL && resize_msg->len > 0) {
		strncpy(p, resize_msg->buf, resize_msg->len);
		bytes += resize_msg->len;
		p += resize_msg->len;
		bufspace -= resize_msg->len;
	}

	if (excuse)
		r = snprintf(p, bufspace,
"%ld.%06ld Latency %d printout skipped due to %s\n",
			     sec, us, req->ticket, str);
	else
		r = snprintf(p, bufspace, "%ld.%06ld Latency %d detected\n",
			    sec, us, req->ticket);

	if (r < 0)
		warn("snprintf() failed");
	else
		bytes += r;

	/* These prints could happen concurrently */
	mutex_lock(&print_mtx);
	write_or_die(fd_stdout, buffer, bytes);
	mutex_unlock(&print_mtx);
}
1110
/* Report that the printout of @req was skipped because of random sleeping. */
static void print_skipmessage(const struct short_msg *resize_msg,
			      const struct timespec *timestamp, char *buffer,
			      size_t bufspace, const struct entry *req,
			      bool excuse)
{
	__print_skipmessage(resize_msg, timestamp, buffer, bufspace, req,
			    excuse, "random delay");
}
1119
/* Report that the printout of @req was lost for the given @reason. */
static void print_lostmessage(const struct timespec *timestamp, char *buffer,
			      size_t bufspace, const struct entry *req,
			      const char *reason)
{
	__print_skipmessage(NULL, timestamp, buffer, bufspace, req, true,
			    reason);
}
1127
/*
 * Print the trace file contents for latency @req into @buffer and out to
 * stdout, framed by BEGIN/END banners and preceded by a line saying
 * whether we randomly slept (@slept us) before printing. Output is
 * serialized through print_mtx.
 *
 * NOTE(review): the early error returns (open/snprintf/read failures)
 * skip printstate_cnt_dec(); presumably the count should be balanced on
 * all paths after printstate_cnt_inc() — verify against the callers.
 */
static void print_tracefile(const struct short_msg *resize_msg,
			    const struct timespec *timestamp, char *buffer,
			    size_t bufspace, long slept,
			    const struct entry *req)
{
	static const int reserve = 256;
	char *p = &buffer[0];
	ssize_t bytes = 0;
	ssize_t bytes_tot = 0;
	long us, sec;
	long slept_ms;
	int trace_fd;

	/* Save some space for the final string and final null char */
	bufspace = bufspace - reserve - 1;

	/* Prepend the probability-table resize message, if one is pending */
	if (resize_msg != NULL && resize_msg->len > 0) {
		bytes = resize_msg->len;
		strncpy(p, resize_msg->buf, bytes);
		bytes_tot += bytes;
		p += bytes;
		bufspace -= bytes;
	}

	trace_fd = open(debug_tracefile, O_RDONLY);

	if (trace_fd < 0) {
		warn("open() failed on %s", debug_tracefile);
		return;
	}

	sec = timestamp->tv_sec;
	us = timestamp->tv_nsec / 1000;

	if (slept != 0) {
		slept_ms = slept / 1000;
		bytes = snprintf(p, bufspace,
"%ld.%06ld Latency %d randomly sleep for %ld ms before print\n",
				 sec, us, req->ticket, slept_ms);
	} else {
		bytes = snprintf(p, bufspace,
				 "%ld.%06ld Latency %d immediate print\n", sec,
				 us, req->ticket);
	}

	if (bytes < 0) {
		warn("snprintf() failed");
		return;
	}
	p += bytes;
	bufspace -= bytes;
	bytes_tot += bytes;

	bytes = snprintf(p, bufspace,
">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BEGIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"
		);

	if (bytes < 0) {
		warn("snprintf() failed");
		return;
	}

	p += bytes;
	bufspace -= bytes;
	bytes_tot += bytes;

	/* Slurp the whole trace file, retrying interrupted reads */
	do {
		bytes = read(trace_fd, p, bufspace);
		if (bytes < 0) {
			if (errno == EINTR)
				continue;
			warn("read() failed on %s", debug_tracefile);
			if (unlikely(close(trace_fd) != 0))
				warn("close() failed on %s", debug_tracefile);
			return;
		}
		if (bytes == 0)
			break;
		p += bytes;
		bufspace -= bytes;
		bytes_tot += bytes;
	} while (true);

	if (unlikely(close(trace_fd) != 0))
		warn("close() failed on %s", debug_tracefile);

	printstate_cnt_dec();
	/* Add the reserve space back to the budget for the final string */
	bufspace += reserve;

	bytes = snprintf(p, bufspace,
			 ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");

	if (bytes < 0) {
		warn("snprintf() failed");
		return;
	}

	bytes_tot += bytes;

	/* These prints could happen concurrently */
	mutex_lock(&print_mtx);
	write_or_die(fd_stdout, buffer, bytes_tot);
	mutex_unlock(&print_mtx);
}
1233
1234static char *get_no_opt(const char *opt)
1235{
1236	char *no_opt;
1237	int s;
1238
1239	s = strlen(opt) + strlen(OPT_NO_PREFIX) + 1;
1240	/* We may be called from cleanup_exit() via set_trace_opt() */
1241	no_opt = malloc_or_die_nocleanup(s);
1242	strcpy(no_opt, OPT_NO_PREFIX);
1243	strcat(no_opt, opt);
1244	return no_opt;
1245}
1246
/*
 * Return a newly allocated copy of the next whitespace-delimited token
 * in @allopt, or NULL if none remains; *@next is set to just past the
 * token for the following call.
 *
 * Fix: <ctype.h> classifiers require an argument representable as
 * unsigned char (or EOF); passing a plain, possibly signed, char is
 * undefined behavior for bytes >= 0x80, so cast explicitly.
 */
static char *find_next_optstr(const char *allopt, const char **next)
{
	const char *begin;
	const char *end;
	char *r;
	int s = 0;

	if (allopt == NULL)
		return NULL;

	for (begin = allopt; *begin != '\0'; begin++) {
		if (isgraph((unsigned char)*begin))
			break;
	}

	if (*begin == '\0')
		return NULL;

	for (end = begin; *end != '\0' && isgraph((unsigned char)*end); end++)
		s++;

	r = malloc_or_die_nocleanup(s + 1);
	strncpy(r, begin, s);
	r[s] = '\0';
	*next = begin + s;
	return r;
}
1274
/*
 * Scan the space-separated option list in allopt for opt or its
 * "no"-prefixed negation. *found reports whether either form was present;
 * the return value is the option's state (true = enabled).
 */
static bool get_trace_opt(const char *allopt, const char *opt, bool *found)
{
	const char *next = allopt;
	char *no_opt;
	char *tok;
	bool rval = false;

	*found = false;
	no_opt = get_no_opt(opt);

	while ((tok = find_next_optstr(next, &next)) != NULL) {
		if (strcmp(tok, opt) == 0) {
			*found = true;
			rval = true;
			free(tok);
			break;
		}
		if (strcmp(tok, no_opt) == 0) {
			*found = true;
			rval = false;
			free(tok);
			break;
		}
		free(tok);
	}
	free(no_opt);

	return rval;
}
1307
1308static int set_trace_opt(const char *opt, bool value)
1309{
1310	char *str;
1311	int r;
1312
1313	if (value)
1314		str = strdup(opt);
1315	else
1316		str = get_no_opt(opt);
1317
1318	r = tracefs_instance_file_write(NULL, TR_OPTIONS, str);
1319	free(str);
1320	return r;
1321}
1322
1323void save_trace_opts(struct ftrace_state *state)
1324{
1325	char *allopt;
1326	int psize;
1327	int i;
1328
1329	allopt = tracefs_instance_file_read(NULL, TR_OPTIONS, &psize);
1330	if (!allopt)
1331		errx(EXIT_FAILURE, "Failed to read the %s file\n", TR_OPTIONS);
1332
1333	for (i = 0; i < OPTIDX_NR; i++)
1334		state->opt[i] = get_trace_opt(allopt, optstr[i],
1335					      &state->opt_valid[i]);
1336
1337	free(allopt);
1338}
1339
1340static void write_file(const char *file, const char *cur, const char *new,
1341		       enum errhandling h)
1342{
1343	int r;
1344	static const char *emsg = "Failed to write to the %s file!";
1345
1346	/* Do nothing if we now that the current and new value are equal */
1347	if (cur && !needs_change(cur, new))
1348		return;
1349
1350	r = tracefs_instance_file_write(NULL, file, new);
1351	if (r < 0) {
1352		if (h) {
1353			warnx(emsg, file);
1354			if (h == ERR_CLEANUP)
1355				cleanup_exit(EXIT_FAILURE);
1356		} else
1357			errx(EXIT_FAILURE, emsg, file);
1358	}
1359	if (verbose_ftrace()) {
1360		mutex_lock(&print_mtx);
1361		printf("%s was set to %s\n", file, new);
1362		mutex_unlock(&print_mtx);
1363	}
1364}
1365
/* Reset tracing_max_latency to 0 so that new latencies will be recorded */
static void reset_max_latency(void)
{
	write_file(TR_MAXLAT, NULL, "0", ERR_CLEANUP);
}
1370
/*
 * Save the current tracer, threshold and trace options into save_state so
 * they can be restored later, and switch to the nop tracer if some other
 * tracer is active. Bails out if another tracer is in use and the -F
 * [--force] option was not given.
 */
static void save_and_disable_tracer(void)
{
	char *orig_th;
	char *tracer;
	bool need_nop = false;

	mutex_lock(&save_state.mutex);

	save_trace_opts(&save_state);
	tracer = read_file(TR_CURRENT, ERR_EXIT);
	orig_th = read_file(TR_THRESH, ERR_EXIT);

	if (needs_change(tracer, NOP_TRACER)) {
		/* Another tracer is active; refuse unless -F was given */
		mutex_lock(&print_mtx);
		if (force_tracer) {
			printf(
				"The %s tracer is already in use but proceeding anyway!\n",
				tracer);
		} else {
			/*
			 * NOTE(review): exit(0) is called with print_mtx
			 * held; harmless since the process terminates here.
			 */
			printf(
				"The %s tracer is already in use, cowardly bailing out!\n"
				"This could indicate that another program or instance is tracing.\n"
				"Use the -F [--force] option to disregard the current tracer.\n", tracer);
			exit(0);
		}
		mutex_unlock(&print_mtx);
		need_nop = true;
	}

	/* save_state takes ownership of the strings read above */
	save_state.tracer =  tracer;
	save_state.thresh = orig_th;

	if (need_nop)
		write_file(TR_CURRENT, NULL, NOP_TRACER, ERR_EXIT);

	mutex_unlock(&save_state.mutex);
}
1408
1409void set_trace_opts(struct ftrace_state *state, bool *new)
1410{
1411	int i;
1412	int r;
1413
1414	/*
1415	 * We only set options if we earlier detected that the option exists in
1416	 * the trace_options file and that the wanted setting is different from
1417	 * the one we saw in save_and_disable_tracer()
1418	 */
1419	for (i = 0; i < OPTIDX_NR; i++)
1420		if (state->opt_valid[i] &&
1421		    state->opt[i] != new[i]) {
1422			r = set_trace_opt(optstr[i], new[i]);
1423			if (r < 0) {
1424				warnx("Failed to set the %s option to %s",
1425				      optstr[i], bool2str(new[i]));
1426				cleanup_exit(EXIT_FAILURE);
1427			}
1428			if (verbose_ftrace()) {
1429				mutex_lock(&print_mtx);
1430				printf("%s in %s was set to %s\n", optstr[i],
1431				       TR_OPTIONS, bool2str(new[i]));
1432				mutex_unlock(&print_mtx);
1433			}
1434		}
1435}
1436
/*
 * Restore the wanted trace options and switch from the nop tracer to the
 * chosen latency tracer with the configured threshold. Must be called
 * after save_and_disable_tracer() has populated save_state.
 */
static void enable_tracer(void)
{
	mutex_lock(&save_state.mutex);
	set_trace_opts(&save_state, use_options);

	/* Set the threshold before enabling the tracer itself */
	write_file(TR_THRESH, save_state.thresh, threshold, ERR_CLEANUP);
	write_file(TR_CURRENT, NOP_TRACER, current_tracer, ERR_CLEANUP);

	mutex_unlock(&save_state.mutex);
}
1447
/*
 * Main loop of the program: watch the tracing_max_latency file with
 * inotify and hand a print request to the printer threads whenever the
 * file is modified, i.e. whenever a new maximum latency was recorded.
 * Never returns; the process exits via signal or error handling.
 */
static void tracing_loop(void)
{
	int ifd = inotify_init();
	int wd;
	const ssize_t bufsize = sizeof(inotify_buffer);
	const ssize_t istructsize = sizeof(struct inotify_event);
	char *buf = &inotify_buffer[0];
	ssize_t nr_read;
	char *p;
	int modified;
	struct inotify_event *event;
	struct entry req;
	char *buffer;
	const size_t bufspace = PRINT_BUFFER_SIZE;
	struct timespec timestamp;

	print_priority();

	buffer = malloc_or_die(bufspace);

	if (ifd < 0)
		err(EXIT_FAILURE, "inotify_init() failed!");


	if (setup_ftrace) {
		/*
		 * We must disable the tracer before resetting the max_latency
		 */
		save_and_disable_tracer();
		/*
		 * We must reset the max_latency before the inotify_add_watch()
		 * call.
		 */
		reset_max_latency();
	}

	wd = inotify_add_watch(ifd, debug_maxlat, IN_MODIFY);
	if (wd < 0)
		err(EXIT_FAILURE, "inotify_add_watch() failed!");

	if (setup_ftrace)
		enable_tracer();

	/* Signals were blocked in main(); start accepting them from here */
	signal_blocking(SIG_UNBLOCK);

	while (true) {
		modified = 0;
		check_signals();
		nr_read = read(ifd, buf, bufsize);
		check_signals();
		if (nr_read < 0) {
			if (errno == EINTR)
				continue;
			warn("read() failed on inotify fd!");
			cleanup_exit(EXIT_FAILURE);
		}
		if (nr_read == bufsize)
			warnx("inotify() buffer filled, skipping events");
		if (nr_read < istructsize) {
			warnx("read() returned too few bytes on inotify fd");
			cleanup_exit(EXIT_FAILURE);
		}

		/* Count the IN_MODIFY events in this batch */
		for (p = buf; p < buf + nr_read;) {
			event = (struct inotify_event *) p;
			if ((event->mask & IN_MODIFY) != 0)
				modified++;
			p += istructsize + event->len;
		}
		/* Issue one print request per detected modification */
		while (modified > 0) {
			check_signals();
			mutex_lock(&printstate.mutex);
			check_signals();
			printstate_next_ticket(&req);
			if (printstate_cnt_read() > 0) {
				/*
				 * An earlier print request is still being
				 * processed; treat this event as lost.
				 */
				printstate_mark_req_completed(&req);
				mutex_unlock(&printstate.mutex);
				if (verbose_lostevent()) {
					clock_gettime_or_die(CLOCK_MONOTONIC,
							     &timestamp);
					print_lostmessage(&timestamp, buffer,
							  bufspace, &req,
							  "inotify loop");
				}
				break;
			}
			mutex_unlock(&printstate.mutex);
			if (queue_try_to_add_entry(&printqueue, &req) != 0) {
				/* These prints could happen concurrently */
				check_signals();
				mutex_lock(&print_mtx);
				check_signals();
				write_or_die(fd_stdout, queue_full_warning,
					     strlen(queue_full_warning));
				mutex_unlock(&print_mtx);
			}
			modified--;
		}
	}
}
1548
/*
 * Printer thread entry point. Waits for print requests queued by
 * tracing_loop() and prints either the content of the trace file or a
 * skip/lost message. arg points to a heap-allocated seed for this
 * thread's private PRNG state.
 */
static void *do_printloop(void *arg)
{
	const size_t bufspace = PRINT_BUFFER_SIZE;
	char *buffer;
	long *rseed = (long *) arg;
	struct drand48_data drandbuf;
	long slept = 0;
	struct entry req;
	int prob = 0;
	struct timespec timestamp;
	struct short_msg resize_msg;

	print_priority();

	/* Seed the per-thread reentrant PRNG used for the sleep coin toss */
	if (srand48_r(*rseed, &drandbuf) != 0) {
		warn("srand48_r() failed!\n");
		cleanup_exit(EXIT_FAILURE);
	}

	buffer = malloc_or_die(bufspace);

	while (true) {
		req = queue_wait_for_entry(&printqueue);
		clock_gettime_or_die(CLOCK_MONOTONIC, &timestamp);
		mutex_lock(&printstate.mutex);
		/* A newer request superseded this one; report it as lost */
		if (prev_req_won_race(&req)) {
			printstate_mark_req_completed(&req);
			mutex_unlock(&printstate.mutex);
			if (verbose_lostevent())
				print_lostmessage(&timestamp, buffer, bufspace,
						  &req, "print loop");
			continue;
		}
		mutex_unlock(&printstate.mutex);

		/*
		 * Toss a coin to decide if we want to sleep before printing
		 * out the backtrace. The reason for this is that opening
		 * /sys/kernel/tracing/trace will cause a blackout of
		 * hundreds of ms, where no latencies will be noted by the
		 * latency tracer. Thus by randomly sleeping we try to avoid
		 * missing traces systematically due to this. With this option
		 * we will sometimes get the first latency, some other times
		 * some of the later ones, in case of closely spaced traces.
		 */
		if (trace_enable && use_random_sleep) {
			slept = 0;
			prob = table_get_probability(&req, &resize_msg);
			if (!toss_coin(&drandbuf, prob))
				slept = go_to_sleep(&req);
			if (slept >= 0) {
				/* A print is ongoing */
				printstate_cnt_inc();
				/*
				 * We will do the printout below so we have to
				 * mark it as completed while we still have the
				 * mutex.
				 */
				mutex_lock(&printstate.mutex);
				printstate_mark_req_completed(&req);
				mutex_unlock(&printstate.mutex);
			}
		}
		if (trace_enable) {
			/*
			 * slept < 0  means that we detected another
			 * notification in go_to_sleep() above
			 */
			if (slept >= 0)
				/*
				 * N.B. printstate_cnt_dec(); will be called
				 * inside print_tracefile()
				 */
				print_tracefile(&resize_msg, &timestamp, buffer,
						bufspace, slept, &req);
			else
				print_skipmessage(&resize_msg, &timestamp,
						  buffer, bufspace, &req, true);
		} else {
			print_skipmessage(&resize_msg, &timestamp, buffer,
					  bufspace, &req, false);
		}
	}
	return NULL;
}
1634
1635static void start_printthread(void)
1636{
1637	unsigned int i;
1638	long *seed;
1639	int ufd;
1640
1641	ufd = open(DEV_URANDOM, O_RDONLY);
1642	if (nr_threads > MAX_THREADS) {
1643		warnx(
1644"Number of requested print threads was %d, max number is %d\n",
1645		      nr_threads, MAX_THREADS);
1646		nr_threads = MAX_THREADS;
1647	}
1648	for (i = 0; i < nr_threads; i++) {
1649		seed = malloc_or_die(sizeof(*seed));
1650		if (ufd <  0 ||
1651		    read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) {
1652			printf(
1653"Warning! Using trivial random number seed, since %s not available\n",
1654			DEV_URANDOM);
1655			fflush(stdout);
1656			*seed = i;
1657		}
1658		errno = pthread_create(&printthread[i], NULL, do_printloop,
1659				       seed);
1660		if (errno)
1661			err(EXIT_FAILURE, "pthread_create()");
1662	}
1663	if (ufd > 0 && close(ufd) != 0)
1664		warn("close() failed");
1665}
1666
1667static void show_usage(void)
1668{
1669	printf(
1670"Usage: %s [OPTION]...\n\n"
1671"Collect closely occurring latencies from %s\n"
1672"with any of the following tracers: preemptirqsoff, preemptoff, irqsoff, "
1673"wakeup,\nwakeup_dl, or wakeup_rt.\n\n"
1674
1675"The occurrence of a latency is detected by monitoring the file\n"
1676"%s with inotify.\n\n"
1677
1678"The following options are supported:\n\n"
1679
1680"-l, --list\t\tList the latency tracers that are supported by the\n"
1681"\t\t\tcurrently running Linux kernel. If you don't see the\n"
1682"\t\t\ttracer that you want, you will probably need to\n"
1683"\t\t\tchange your kernel config and build a new kernel.\n\n"
1684
1685"-t, --tracer TR\t\tUse the tracer TR. The default is to use the first\n"
1686"\t\t\ttracer that is supported by the kernel in the following\n"
1687"\t\t\torder of precedence:\n\n"
1688"\t\t\tpreemptirqsoff\n"
1689"\t\t\tpreemptoff\n"
1690"\t\t\tirqsoff\n"
1691"\t\t\twakeup\n"
1692"\t\t\twakeup_rt\n"
1693"\t\t\twakeup_dl\n"
1694"\n"
1695"\t\t\tIf TR is not on the list above, then a warning will be\n"
1696"\t\t\tprinted.\n\n"
1697
1698"-F, --force\t\tProceed even if another ftrace tracer is active. Without\n"
1699"\t\t\tthis option, the program will refuse to start tracing if\n"
1700"\t\t\tany other tracer than the nop tracer is active.\n\n"
1701
1702"-s, --threshold TH\tConfigure ftrace to use a threshold of TH microseconds\n"
1703"\t\t\tfor the tracer. The default is 0, which means that\n"
1704"\t\t\ttracing_max_latency will be used. tracing_max_latency is\n"
1705"\t\t\tset to 0 when the program is started and contains the\n"
1706"\t\t\tmaximum of the latencies that have been encountered.\n\n"
1707
1708"-f, --function\t\tEnable the function-trace option in trace_options. With\n"
1709"\t\t\tthis option, ftrace will trace the functions that are\n"
1710"\t\t\texecuted during a latency, without it we only get the\n"
1711"\t\t\tbeginning, end, and backtrace.\n\n"
1712
1713"-g, --graph\t\tEnable the display-graph option in trace_option. This\n"
1714"\t\t\toption causes ftrace to show the graph of how functions\n"
1715"\t\t\tare calling other functions.\n\n"
1716
1717"-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n"
1718"\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n"
1719"\t\t\tusing rr or fifo, remember that these policies may cause\n"
1720"\t\t\tother tasks to experience latencies.\n\n"
1721
1722"-p, --priority PRI\tRun the program with priority PRI. The acceptable range\n"
1723"\t\t\tof PRI depends on the scheduling policy.\n\n"
1724
1725"-n, --notrace\t\tIf latency is detected, do not print out the content of\n"
1726"\t\t\tthe trace file to standard output\n\n"
1727
1728"-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"
1729
1730"-r, --random\t\tArbitrarily sleep a certain amount of time, default\n"
1731"\t\t\t%ld ms, before reading the trace file. The\n"
1732"\t\t\tprobabilities for sleep are chosen so that the\n"
1733"\t\t\tprobability of obtaining any of a cluster of closely\n"
1734"\t\t\toccurring latencies are equal, i.e. we will randomly\n"
1735"\t\t\tchoose which one we collect from the trace file.\n\n"
1736"\t\t\tThis option is probably only useful with the irqsoff,\n"
1737"\t\t\tpreemptoff, and preemptirqsoff tracers.\n\n"
1738
1739"-a, --nrlat NRLAT\tFor the purpose of arbitrary delay, assume that there\n"
1740"\t\t\tare no more than NRLAT clustered latencies. If NRLAT\n"
1741"\t\t\tlatencies are detected during a run, this value will\n"
1742"\t\t\tautomatically be increased to NRLAT + 1 and then to\n"
1743"\t\t\tNRLAT + 2 and so on. The default is %d. This option\n"
1744"\t\t\timplies -r. We need to know this number in order to\n"
1745"\t\t\tbe able to calculate the probabilities of sleeping.\n"
1746"\t\t\tSpecifically, the probabilities of not sleeping, i.e. to\n"
1747"\t\t\tdo an immediate printout will be:\n\n"
1748"\t\t\t1/NRLAT  1/(NRLAT - 1) ... 1/3  1/2  1\n\n"
1749"\t\t\tThe probability of sleeping will be:\n\n"
1750"\t\t\t1 - P, where P is from the series above\n\n"
1751"\t\t\tThis descending probability will cause us to choose\n"
1752"\t\t\tan occurrence at random. Observe that the final\n"
1753"\t\t\tprobability is 0, it is when we reach this probability\n"
1754"\t\t\tthat we increase NRLAT automatically. As an example,\n"
1755"\t\t\twith the default value of 2, the probabilities will be:\n\n"
1756"\t\t\t1/2  0\n\n"
1757"\t\t\tThis means, when a latency is detected we will sleep\n"
1758"\t\t\twith 50%% probability. If we ever detect another latency\n"
1759"\t\t\tduring the sleep period, then the probability of sleep\n"
1760"\t\t\twill be 0%% and the table will be expanded to:\n\n"
1761"\t\t\t1/3  1/2  0\n\n"
1762
1763"-v, --verbose\t\tIncrease the verbosity. If this option is given once,\n"
1764"\t\t\tthen print a message every time that the NRLAT value\n"
1765"\t\t\tis automatically increased. It also causes a message to\n"
1766"\t\t\tbe printed when the ftrace settings are changed. If this\n"
1767"\t\t\toption is given at least twice, then also print a\n"
1768"\t\t\twarning for lost events.\n\n"
1769
1770"-u, --time TIME\t\tArbitrarily sleep for a specified time TIME ms before\n"
1771"\t\t\tprinting out the trace from the trace file. The default\n"
1772"\t\t\tis %ld ms. This option implies -r.\n\n"
1773
1774"-x, --no-ftrace\t\tDo not configure ftrace. This assume that the user\n"
1775"\t\t\tconfigures the ftrace files in sysfs such as\n"
1776"\t\t\t/sys/kernel/tracing/current_tracer or equivalent.\n\n"
1777
1778"-i, --tracefile FILE\tUse FILE as trace file. The default is\n"
1779"\t\t\t%s.\n"
1780"\t\t\tThis options implies -x\n\n"
1781
1782"-m, --max-lat FILE\tUse FILE as tracing_max_latency file. The default is\n"
1783"\t\t\t%s.\n"
1784"\t\t\tThis options implies -x\n\n"
1785,
1786prg_name, debug_tracefile_dflt, debug_maxlat_dflt, DEFAULT_NR_PRINTER_THREADS,
1787SLEEP_TIME_MS_DEFAULT, DEFAULT_TABLE_SIZE, SLEEP_TIME_MS_DEFAULT,
1788debug_tracefile_dflt, debug_maxlat_dflt);
1789}
1790
1791static void find_tracefiles(void)
1792{
1793	debug_tracefile_dflt = tracefs_get_tracing_file("trace");
1794	if (debug_tracefile_dflt == NULL) {
1795		/* This is needed in show_usage() */
1796		debug_tracefile_dflt = DEBUG_NOFILE;
1797	}
1798
1799	debug_maxlat_dflt = tracefs_get_tracing_file("tracing_max_latency");
1800	if (debug_maxlat_dflt == NULL) {
1801		/* This is needed in show_usage() */
1802		debug_maxlat_dflt = DEBUG_NOFILE;
1803	}
1804
1805	debug_tracefile = debug_tracefile_dflt;
1806	debug_maxlat = debug_maxlat_dflt;
1807}
1808
/*
 * Return true if s consists solely of decimal digits.
 * Note: the empty string is considered all-digits by this check.
 */
bool alldigits(const char *s)
{
	for (; *s != '\0'; s++)
		/*
		 * Cast to unsigned char: passing a negative plain char to
		 * isdigit() is undefined behavior (CERT STR37-C).
		 */
		if (!isdigit((unsigned char)*s))
			return false;
	return true;
}
1816
/*
 * Die with a usage error unless arg is a purely decimal string; argname
 * names the offending option in the error message. The parameter was
 * renamed from optarg to avoid shadowing the global optarg declared by
 * <getopt.h> (callers pass arguments positionally, so this is safe).
 */
void check_alldigits(const char *arg, const char *argname)
{
	if (!alldigits(arg))
		errx(EXIT_FAILURE,
		     "The %s parameter expects a decimal argument\n", argname);
}
1823
/*
 * Parse the command line, validate option values, and initialize the
 * global configuration: tracer, threshold, trace options, scheduling
 * policy/priority, random sleep parameters, and trace file paths.
 * Exits on invalid input or when a listing/help option was requested.
 */
static void scan_arguments(int argc, char *argv[])
{
	int c;
	int i;
	int option_idx = 0;

	static struct option long_options[] = {
		{ "list",       no_argument,            0, 'l' },
		{ "tracer",	required_argument,	0, 't' },
		{ "force",      no_argument,            0, 'F' },
		{ "threshold",  required_argument,      0, 's' },
		{ "function",   no_argument,            0, 'f' },
		{ "graph",      no_argument,            0, 'g' },
		{ "policy",	required_argument,	0, 'c' },
		{ "priority",	required_argument,	0, 'p' },
		{ "help",	no_argument,		0, 'h' },
		{ "notrace",	no_argument,		0, 'n' },
		{ "random",	no_argument,		0, 'r' },
		{ "nrlat",	required_argument,	0, 'a' },
		{ "threads",	required_argument,	0, 'e' },
		{ "time",	required_argument,	0, 'u' },
		{ "verbose",	no_argument,		0, 'v' },
		{ "no-ftrace",  no_argument,            0, 'x' },
		{ "tracefile",	required_argument,	0, 'i' },
		{ "max-lat",	required_argument,	0, 'm' },
		{ 0,		0,			0,  0  }
	};
	const struct policy *p;
	int max, min;
	int value;
	bool notracer, valid;

	/*
	 * We must do this before parsing the arguments because show_usage()
	 * needs to display these.
	 */
	find_tracefiles();

	while (true) {
		c = getopt_long(argc, argv, "lt:Fs:fgc:p:hnra:e:u:vxi:m:",
				long_options, &option_idx);
		if (c == -1)
			break;

		switch (c) {
		case 'l':
			show_available();
			exit(0);
			break;
		case 't':
			/* An unknown tracer only warns; an unsupported one is fatal */
			current_tracer = strdup(optarg);
			if (!is_relevant_tracer(current_tracer)) {
				warnx("%s is not a known latency tracer!\n",
				      current_tracer);
			}
			valid = tracer_valid(current_tracer, &notracer);
			if (notracer)
				errx(EXIT_FAILURE, no_tracer_msg);
			if (!valid)
				errx(EXIT_FAILURE,
"The tracer %s is not supported by your kernel!\n", current_tracer);
			break;
		case 'F':
			force_tracer = true;
			break;
		case 's':
			check_alldigits(optarg, "-s [--threshold]");
			threshold = strdup(optarg);
			break;
		case 'f':
			use_options[OPTIDX_FUNC_TR] = true;
			break;
		case 'g':
			use_options[OPTIDX_DISP_GR] = true;
			break;
		case 'c':
			/* -c may also set a policy-specific default priority */
			p = policy_from_name(optarg);
			if (p != NULL) {
				sched_policy = p->policy;
				sched_policy_set = true;
				if (!sched_pri_set) {
					sched_pri = p->default_pri;
					sched_pri_set = true;
				}
			} else {
				warnx("Unknown scheduling %s\n", optarg);
				show_usage();
				exit(0);
			}
			break;
		case 'p':
			check_alldigits(optarg, "-p [--priority]");
			sched_pri = atoi(optarg);
			sched_pri_set = true;
			break;
		case 'h':
			show_usage();
			exit(0);
			break;
		case 'n':
			/* -n disables printing, which makes random sleep moot */
			trace_enable = false;
			use_random_sleep = false;
			break;
		case 'e':
			check_alldigits(optarg, "-e [--threads]");
			value = atoi(optarg);
			if (value > 0)
				nr_threads = value;
			else {
				warnx("NRTHR must be > 0\n");
				show_usage();
				exit(0);
			}
			break;
		case 'u':
			/* -u implies -r, i.e. random sleep with a fixed time */
			check_alldigits(optarg, "-u [--time]");
			value = atoi(optarg);
			if (value < 0) {
				warnx("TIME must be >= 0\n");
				show_usage();
				exit(0);
			}
			trace_enable = true;
			use_random_sleep = true;
			sleep_time = value * USEC_PER_MSEC;
			break;
		case 'v':
			verbosity++;
			break;
		case 'r':
			trace_enable = true;
			use_random_sleep = true;
			break;
		case 'a':
			/* -a implies -r and sets the initial probability table size */
			check_alldigits(optarg, "-a [--nrlat]");
			value = atoi(optarg);
			if (value <= 0) {
				warnx("NRLAT must be > 0\n");
				show_usage();
				exit(0);
			}
			trace_enable = true;
			use_random_sleep = true;
			table_startsize = value;
			break;
		case 'x':
			setup_ftrace = false;
			break;
		case 'i':
			/* A custom trace file implies -x */
			setup_ftrace = false;
			debug_tracefile = strdup(optarg);
			break;
		case 'm':
			/* A custom max latency file implies -x */
			setup_ftrace = false;
			debug_maxlat = strdup(optarg);
			break;
		default:
			show_usage();
			exit(0);
			break;
		}
	}

	if (setup_ftrace) {
		if (!current_tracer) {
			current_tracer = find_default_tracer();
			if (!current_tracer)
				errx(EXIT_FAILURE,
"No default tracer found and tracer not specified\n");
		}

		/* Random sleep mainly helps the *off tracers; warn otherwise */
		if (use_random_sleep && !random_makes_sense(current_tracer)) {
			warnx("WARNING: The tracer is %s and random sleep has",
			      current_tracer);
			fprintf(stderr,
"been enabled. Random sleep is intended for the following tracers:\n");
			for (i = 0; random_tracers[i]; i++)
				fprintf(stderr, "%s\n", random_tracers[i]);
			fprintf(stderr, "\n");
		}
	}

	/* The sentinel from find_tracefiles() means tracefs was not found */
	if (debug_tracefile == DEBUG_NOFILE ||
	    debug_maxlat == DEBUG_NOFILE)
		errx(EXIT_FAILURE,
"Could not find tracing directory e.g. /sys/kernel/tracing\n");

	if (!sched_policy_set) {
		sched_policy = SCHED_RR;
		sched_policy_set = true;
		if (!sched_pri_set) {
			sched_pri = RT_DEFAULT_PRI;
			sched_pri_set = true;
		}
	}

	/* Clamp the priority to what the chosen policy allows */
	max = sched_get_priority_max(sched_policy);
	min = sched_get_priority_min(sched_policy);

	if (sched_pri < min) {
		printf(
"ATTENTION: Increasing priority to minimum, which is %d\n", min);
		sched_pri = min;
	}
	if (sched_pri > max) {
		printf(
"ATTENTION: Reducing priority to maximum, which is %d\n", max);
		sched_pri = max;
	}
}
2034
2035static void show_params(void)
2036{
2037	printf(
2038"\n"
2039"Running with scheduling policy %s and priority %d. Using %d print threads.\n",
2040		policy_name(sched_policy), sched_pri, nr_threads);
2041	if (trace_enable) {
2042		if (use_random_sleep) {
2043			printf(
2044"%s will be printed with random delay\n"
2045"Start size of the probability table:\t\t\t%d\n"
2046"Print a message when the prob. table changes size:\t%s\n"
2047"Print a warning when an event has been lost:\t\t%s\n"
2048"Sleep time is:\t\t\t\t\t\t%ld ms\n",
2049debug_tracefile,
2050table_startsize,
2051bool2str(verbose_sizechange()),
2052bool2str(verbose_lostevent()),
2053sleep_time / USEC_PER_MSEC);
2054		} else {
2055			printf("%s will be printed immediately\n",
2056			       debug_tracefile);
2057		}
2058	} else {
2059		printf("%s will not be printed\n",
2060		       debug_tracefile);
2061	}
2062	if (setup_ftrace) {
2063		printf("Tracer:\t\t\t\t\t\t\t%s\n"
2064		       "%s option:\t\t\t\t\t%s\n"
2065		       "%s option:\t\t\t\t\t%s\n",
2066		       current_tracer,
2067		       optstr[OPTIDX_FUNC_TR],
2068		       bool2str(use_options[OPTIDX_FUNC_TR]),
2069		       optstr[OPTIDX_DISP_GR],
2070		       bool2str(use_options[OPTIDX_DISP_GR]));
2071		if (!strcmp(threshold, "0"))
2072			printf("Threshold:\t\t\t\t\t\ttracing_max_latency\n");
2073		else
2074			printf("Threshold:\t\t\t\t\t\t%s\n", threshold);
2075	}
2076	printf("\n");
2077}
2078
2079int main(int argc, char *argv[])
2080{
2081	init_save_state();
2082	signal_blocking(SIG_BLOCK);
2083	setup_sig_handler();
2084	open_stdout();
2085
2086	if (argc >= 1)
2087		prg_name = argv[0];
2088	else
2089		prg_name = prg_unknown;
2090
2091	scan_arguments(argc, argv);
2092	show_params();
2093
2094	init_printstate();
2095	init_print_mtx();
2096	if (use_random_sleep) {
2097		init_probabilities();
2098		if (verbose_sizechange())
2099			printf("Initializing probability table to %d\n",
2100			       table_startsize);
2101		sleeptable_resize(table_startsize, false, NULL);
2102	}
2103	set_priority();
2104	init_queue(&printqueue);
2105	start_printthread();
2106	tracing_loop();
2107	return 0;
2108}
2109