1// Copyright 2018 The Fuchsia Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "watchdog.h" 6 7#include <assert.h> 8#include <errno.h> 9#include <inttypes.h> 10#include <limits.h> 11#include <pthread.h> 12#include <string.h> 13#include <time.h> 14 15#include <unittest/unittest.h> 16 17constexpr int WATCHDOG_ERRCODE = 5; 18 19constexpr uint64_t NANOSECONDS_PER_SECOND = 1000 * 1000 * 1000; 20 21// The watchdog thread wakes up after this many seconds to check whether 22// a test has timed out. The lower the number this is the more accurate 23// the watchdog is with regard to the specified timeout. But there's 24// no point in running too frequently. The wait mechanism we use is 25// interruptible, so this value can be high and there's no worries of waiting 26// for the watchdog to terminate. The watchdog works this way so that we 27// can have one watchdog thread that is continuously running instead of 28// starting a new watchdog thread for each test. Another goal is to not 29// require any synchronization between the watchdog thread and the test. 30// E.g., We don't want to have to wait for the watchdog to acknowledge that 31// a test is starting and stopping. Instead we just let it run at its own pace. 32// Tests often complete in milliseconds, far lower than our "tick". 33constexpr int WATCHDOG_TICK_SECONDS = 1; 34 35// Value stored in |active_timeout_seconds| to indicate test is not running. 36constexpr int WATCHDOG_TIMEOUT_NOT_RUNNING = INT_MAX; 37 38// This can be overridden by the user by setting env var WATCHDOG_ENV_NAME. 39static int base_timeout_seconds = DEFAULT_BASE_TIMEOUT_SECONDS; 40 41static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 42 43// The name of the current test. 44// Used to report which test timed out. 45static const char* test_name; //TA_GUARDED(mutex) 46 47// The current timeout in effect. 48// When tests aren't running we set this to INT_MAX. 49static int active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING; //TA_GUARDED(mutex) 50 51// The time when the test was started. 52// This is the result of clock_gettime converted to nanoseconds. 53static uint64_t test_start_time; //TA_GUARDED(mutex) 54 55// True if tests are running. 56// Set by watchdog initialize(), reset by watchdog_terminate(). 57static bool tests_running; //TA_GUARDED(mutex) 58 59static pthread_t watchdog_thread; 60 61// This library is used for both the host and target. 62// For portability concerns we use pthread_cond_timedwait to get a 63// cancelable wait. 64static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; 65 66static uint64_t timespec_to_nanoseconds(const struct timespec* ts) { 67 return (ts->tv_sec * NANOSECONDS_PER_SECOND) + ts->tv_nsec; 68} 69 70/** 71 * Set the base timeout. 72 * |timeout| must be >= 0. 73 * A value of zero disables the timeout. 74 * The timeout must be set before calling watchdog_initialize(), and must 75 * not be changed until after watchdog_terminate() is called. 76 */ 77void watchdog_set_base_timeout(int seconds) { 78 assert(seconds >= 0); 79 base_timeout_seconds = seconds; 80} 81 82static int test_timeout_for_type(test_type_t type) { 83 int factor; 84 85 switch (type) { 86 case TEST_SMALL: 87 factor = TEST_TIMEOUT_FACTOR_SMALL; 88 break; 89 case TEST_MEDIUM: 90 factor = TEST_TIMEOUT_FACTOR_MEDIUM; 91 break; 92 case TEST_LARGE: 93 factor = TEST_TIMEOUT_FACTOR_LARGE; 94 break; 95 case TEST_PERFORMANCE: 96 factor = TEST_TIMEOUT_FACTOR_PERFORMANCE; 97 break; 98 default: 99 __UNREACHABLE; 100 } 101 102 int64_t timeout = base_timeout_seconds * factor; 103 if (timeout > INT_MAX) 104 timeout = INT_MAX; 105 return static_cast<int>(timeout); 106} 107 108/** 109 * Return true if watchdog support is enabled for this test run. 110 */ 111bool watchdog_is_enabled() { 112 return base_timeout_seconds > 0; 113} 114 115static __NO_RETURN void watchdog_signal_timeout(const char* name) { 116 unittest_printf_critical("\n\n*** WATCHDOG TIMER FIRED, test: %s ***\n", name); 117 exit(WATCHDOG_ERRCODE); 118} 119 120static void* watchdog_thread_func(void* arg) { 121 pthread_mutex_lock(&mutex); 122 123 for (;;) { 124 // Has watchdog_terminate() been called? 125 // Test this here, before calling pthread_cond_timedwait(), so that 126 // we catch the case of all tests completing and watchdog_terminate 127 // being called before we get started. Otherwise we'll wait one tick 128 // before we notice this. 129 if (!tests_running) { 130 pthread_mutex_unlock(&mutex); 131 break; 132 } 133 134 struct timespec delay; 135 clock_gettime(CLOCK_REALTIME, &delay); 136 delay.tv_sec += WATCHDOG_TICK_SECONDS; 137 // If compiled with #define NDEBUG the assert essentially goes away. 138 // Thus we need to protect |result| with __UNUSED lest the compiler 139 // complain and fail the build. 140 auto result __UNUSED = pthread_cond_timedwait(&cond, &mutex, &delay); 141 // We can time-out just as watchdog_terminate() is called, and 142 // thus we can't make any assumptions based on |result|. 143 assert(result == 0 || result == ETIMEDOUT); 144 145 struct timespec now; 146 clock_gettime(CLOCK_REALTIME, &now); 147 uint64_t now_nanos = timespec_to_nanoseconds(&now); 148 assert (now_nanos >= test_start_time); 149 uint64_t elapsed_nanos = now_nanos - test_start_time; 150 151 // Note: We skip worrying about handling the (rare) case where the 152 // test completes but before it can notify us we wake and see that 153 // the timeout has been reached. 154 uint64_t timeout_nanos = active_timeout_seconds * NANOSECONDS_PER_SECOND; 155 if (elapsed_nanos >= timeout_nanos) { 156 pthread_mutex_unlock(&mutex); 157 watchdog_signal_timeout(test_name); 158 /* NOTREACHED */ 159 } 160 } 161 162 return nullptr; 163} 164 165/** 166 * Start the watchdog thread. 167 * 168 * The thread begins in an idle state, waiting for watchdog_start(). 169 * This must only be called once. 170 */ 171void watchdog_initialize() { 172 if (watchdog_is_enabled()) { 173 tests_running = true; 174 int res = pthread_create(&watchdog_thread, NULL, &watchdog_thread_func, NULL); 175 if (res != 0) { 176 unittest_printf_critical("ERROR STARTING WATCHDOG THREAD: %d(%s)\n", res, strerror(res)); 177 exit(WATCHDOG_ERRCODE); 178 } 179 } 180} 181 182/** 183 * Turn on the watchdog timer for test |name|. 184 * 185 * Storage for |name| must survive the duration of the test. 186 * 187 * If the timer goes off the process terminates. 188 * This must be called at the start of a test. 189 */ 190void watchdog_start(test_type_t type, const char* name) { 191 if (watchdog_is_enabled()) { 192 pthread_mutex_lock(&mutex); 193 test_name = name; 194 active_timeout_seconds = test_timeout_for_type(type); 195 struct timespec now; 196 clock_gettime(CLOCK_REALTIME, &now); 197 test_start_time = timespec_to_nanoseconds(&now); 198 pthread_mutex_unlock(&mutex); 199 } 200} 201 202/** 203 * Call this to turn off the watchdog timer. 204 * 205 * Yeah, there's a "race" if a test finishes right when we're called. 206 * We don't worry about this small window given the amount of time we wait. 207 * This must be called after watchdog_start(). 208 */ 209void watchdog_cancel() { 210 if (watchdog_is_enabled()) { 211 pthread_mutex_lock(&mutex); 212 active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING; 213 test_name = nullptr; 214 pthread_mutex_unlock(&mutex); 215 } 216} 217 218/** 219 * Terminate the watchdog thread. 220 * 221 * This must be called after all tests complete. 222 */ 223void watchdog_terminate() { 224 // All tests must have completed. 225 assert(active_timeout_seconds == WATCHDOG_TIMEOUT_NOT_RUNNING); 226 227 if (watchdog_is_enabled()) { 228 pthread_mutex_lock(&mutex); 229 tests_running = false; 230 int res = pthread_cond_signal(&cond); 231 assert(res == 0); 232 pthread_mutex_unlock(&mutex); 233 res = pthread_join(watchdog_thread, NULL); 234 assert(res == 0); 235 } 236} 237