1// Copyright 2018 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "watchdog.h"
6
7#include <assert.h>
8#include <errno.h>
9#include <inttypes.h>
10#include <limits.h>
11#include <pthread.h>
12#include <string.h>
13#include <time.h>
14
15#include <unittest/unittest.h>
16
// Exit status used when the process is terminated on behalf of the
// watchdog (a test timed out, or the watchdog thread could not be started).
constexpr int WATCHDOG_ERRCODE = 5;

constexpr uint64_t NANOSECONDS_PER_SECOND = 1000000000;

// How long, in seconds, the watchdog thread sleeps between checks for a
// timed-out test. A smaller tick tracks the requested timeout more
// precisely, but there is little value in polling very often. The wait is
// interruptible, so even a large tick does not delay shutdown of the
// watchdog.
//
// Design notes: a single, continuously running watchdog thread is used
// rather than one thread per test, and the thread is deliberately not
// synchronized with test start/stop. Tests never wait for the watchdog to
// acknowledge anything; the watchdog simply polls at its own pace. Most
// tests finish in milliseconds, well below one tick.
constexpr int WATCHDOG_TICK_SECONDS = 1;

// Sentinel held in |active_timeout_seconds| while no test is running.
constexpr int WATCHDOG_TIMEOUT_NOT_RUNNING = INT_MAX;
37
38// This can be overridden by the user by setting env var WATCHDOG_ENV_NAME.
39static int base_timeout_seconds = DEFAULT_BASE_TIMEOUT_SECONDS;
40
41static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
42
43// The name of the current test.
44// Used to report which test timed out.
45static const char* test_name; //TA_GUARDED(mutex)
46
47// The current timeout in effect.
48// When tests aren't running we set this to INT_MAX.
49static int active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING; //TA_GUARDED(mutex)
50
51// The time when the test was started.
52// This is the result of clock_gettime converted to nanoseconds.
53static uint64_t test_start_time; //TA_GUARDED(mutex)
54
55// True if tests are running.
56// Set by watchdog initialize(), reset by watchdog_terminate().
57static bool tests_running; //TA_GUARDED(mutex)
58
59static pthread_t watchdog_thread;
60
61// This library is used for both the host and target.
62// For portability concerns we use pthread_cond_timedwait to get a
63// cancelable wait.
64static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
65
66static uint64_t timespec_to_nanoseconds(const struct timespec* ts) {
67    return (ts->tv_sec * NANOSECONDS_PER_SECOND) + ts->tv_nsec;
68}
69
70/**
71 * Set the base timeout.
72 * |timeout| must be >= 0.
73 * A value of zero disables the timeout.
74 * The timeout must be set before calling watchdog_initialize(), and must
75 * not be changed until after watchdog_terminate() is called.
76 */
77void watchdog_set_base_timeout(int seconds) {
78    assert(seconds >= 0);
79    base_timeout_seconds = seconds;
80}
81
82static int test_timeout_for_type(test_type_t type) {
83    int factor;
84
85    switch (type) {
86    case TEST_SMALL:
87        factor = TEST_TIMEOUT_FACTOR_SMALL;
88        break;
89    case TEST_MEDIUM:
90        factor = TEST_TIMEOUT_FACTOR_MEDIUM;
91        break;
92    case TEST_LARGE:
93        factor = TEST_TIMEOUT_FACTOR_LARGE;
94        break;
95    case TEST_PERFORMANCE:
96        factor = TEST_TIMEOUT_FACTOR_PERFORMANCE;
97        break;
98    default:
99        __UNREACHABLE;
100    }
101
102    int64_t timeout = base_timeout_seconds * factor;
103    if (timeout > INT_MAX)
104        timeout = INT_MAX;
105    return static_cast<int>(timeout);
106}
107
108/**
109 * Return true if watchdog support is enabled for this test run.
110 */
111bool watchdog_is_enabled() {
112    return base_timeout_seconds > 0;
113}
114
115static __NO_RETURN void watchdog_signal_timeout(const char* name) {
116    unittest_printf_critical("\n\n*** WATCHDOG TIMER FIRED, test: %s ***\n", name);
117    exit(WATCHDOG_ERRCODE);
118}
119
120static void* watchdog_thread_func(void* arg) {
121    pthread_mutex_lock(&mutex);
122
123    for (;;) {
124        // Has watchdog_terminate() been called?
125        // Test this here, before calling pthread_cond_timedwait(), so that
126        // we catch the case of all tests completing and watchdog_terminate
127        // being called before we get started. Otherwise we'll wait one tick
128        // before we notice this.
129        if (!tests_running) {
130            pthread_mutex_unlock(&mutex);
131            break;
132        }
133
134        struct timespec delay;
135        clock_gettime(CLOCK_REALTIME, &delay);
136        delay.tv_sec += WATCHDOG_TICK_SECONDS;
137        // If compiled with #define NDEBUG the assert essentially goes away.
138        // Thus we need to protect |result| with __UNUSED lest the compiler
139        // complain and fail the build.
140        auto result __UNUSED = pthread_cond_timedwait(&cond, &mutex, &delay);
141        // We can time-out just as watchdog_terminate() is called, and
142        // thus we can't make any assumptions based on |result|.
143        assert(result == 0 || result == ETIMEDOUT);
144
145        struct timespec now;
146        clock_gettime(CLOCK_REALTIME, &now);
147        uint64_t now_nanos = timespec_to_nanoseconds(&now);
148        assert (now_nanos >= test_start_time);
149        uint64_t elapsed_nanos = now_nanos - test_start_time;
150
151        // Note: We skip worrying about handling the (rare) case where the
152        // test completes but before it can notify us we wake and see that
153        // the timeout has been reached.
154        uint64_t timeout_nanos = active_timeout_seconds * NANOSECONDS_PER_SECOND;
155        if (elapsed_nanos >= timeout_nanos) {
156            pthread_mutex_unlock(&mutex);
157            watchdog_signal_timeout(test_name);
158            /* NOTREACHED */
159        }
160    }
161
162    return nullptr;
163}
164
165/**
166 * Start the watchdog thread.
167 *
168 * The thread begins in an idle state, waiting for watchdog_start().
169 * This must only be called once.
170 */
171void watchdog_initialize() {
172    if (watchdog_is_enabled()) {
173        tests_running = true;
174        int res = pthread_create(&watchdog_thread, NULL, &watchdog_thread_func, NULL);
175        if (res != 0) {
176            unittest_printf_critical("ERROR STARTING WATCHDOG THREAD: %d(%s)\n", res, strerror(res));
177            exit(WATCHDOG_ERRCODE);
178        }
179    }
180}
181
182/**
183 * Turn on the watchdog timer for test |name|.
184 *
185 * Storage for |name| must survive the duration of the test.
186 *
187 * If the timer goes off the process terminates.
188 * This must be called at the start of a test.
189 */
190void watchdog_start(test_type_t type, const char* name) {
191    if (watchdog_is_enabled()) {
192        pthread_mutex_lock(&mutex);
193        test_name = name;
194        active_timeout_seconds = test_timeout_for_type(type);
195        struct timespec now;
196        clock_gettime(CLOCK_REALTIME, &now);
197        test_start_time = timespec_to_nanoseconds(&now);
198        pthread_mutex_unlock(&mutex);
199    }
200}
201
202/**
203 * Call this to turn off the watchdog timer.
204 *
205 * Yeah, there's a "race" if a test finishes right when we're called.
206 * We don't worry about this small window given the amount of time we wait.
207 * This must be called after watchdog_start().
208 */
209void watchdog_cancel() {
210    if (watchdog_is_enabled()) {
211        pthread_mutex_lock(&mutex);
212        active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING;
213        test_name = nullptr;
214        pthread_mutex_unlock(&mutex);
215    }
216}
217
218/**
219 * Terminate the watchdog thread.
220 *
221 * This must be called after all tests complete.
222 */
223void watchdog_terminate() {
224    // All tests must have completed.
225    assert(active_timeout_seconds == WATCHDOG_TIMEOUT_NOT_RUNNING);
226
227    if (watchdog_is_enabled()) {
228        pthread_mutex_lock(&mutex);
229        tests_running = false;
230        int res = pthread_cond_signal(&cond);
231        assert(res == 0);
232        pthread_mutex_unlock(&mutex);
233        res = pthread_join(watchdog_thread, NULL);
234        assert(res == 0);
235    }
236}
237