1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2005,2008 Oracle. All rights reserved. 5 * 6 * $Id: repmgr_elect.c,v 1.41 2008/03/13 17:31:28 mbrey Exp $ 7 */ 8 9#include "db_config.h" 10 11#define __INCLUDE_NETWORKING 1 12#include "db_int.h" 13 14static int __repmgr_is_ready __P((ENV *)); 15static int __repmgr_elect_main __P((ENV *)); 16static void *__repmgr_elect_thread __P((void *)); 17static int start_election_thread __P((ENV *)); 18 19/* 20 * Starts the election thread, or wakes up an existing one, starting off with 21 * the specified operation (an election, or a call to rep_start(CLIENT), or 22 * nothing). Avoid multiple concurrent elections. 23 * 24 * PUBLIC: int __repmgr_init_election __P((ENV *, int)); 25 * 26 * !!! 27 * Caller must hold mutex. 28 */ 29int 30__repmgr_init_election(env, initial_operation) 31 ENV *env; 32 int initial_operation; 33{ 34 DB_REP *db_rep; 35 int ret; 36 37 db_rep = env->rep_handle; 38 if (db_rep->finished) { 39 RPRINT(env, DB_VERB_REPMGR_MISC, (env, 40 "ignoring elect thread request %d; repmgr is finished", 41 initial_operation)); 42 return (0); 43 } 44 45 db_rep->operation_needed = initial_operation; 46 if (db_rep->elect_thread == NULL) 47 ret = start_election_thread(env); 48 else if (db_rep->elect_thread->finished) { 49 RPRINT(env, DB_VERB_REPMGR_MISC, 50 (env, "join dead elect thread")); 51 if ((ret = __repmgr_thread_join(db_rep->elect_thread)) != 0) 52 return (ret); 53 __os_free(env, db_rep->elect_thread); 54 db_rep->elect_thread = NULL; 55 ret = start_election_thread(env); 56 } else { 57 RPRINT(env, DB_VERB_REPMGR_MISC, 58 (env, "reusing existing elect thread")); 59 if ((ret = __repmgr_signal(&db_rep->check_election)) != 0) 60 __db_err(env, ret, "can't signal election thread"); 61 } 62 return (ret); 63} 64 65/* 66 * !!! 67 * Caller holds mutex. 68 */ 69static int 70start_election_thread(env) 71 ENV *env; 72{ 73 DB_REP *db_rep; 74 REPMGR_RUNNABLE *elector; 75 int ret; 76 77 db_rep = env->rep_handle; 78 79 if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector)) 80 != 0) 81 return (ret); 82 elector->env = env; 83 elector->run = __repmgr_elect_thread; 84 85 if ((ret = __repmgr_thread_start(env, elector)) == 0) 86 db_rep->elect_thread = elector; 87 else 88 __os_free(env, elector); 89 90 return (ret); 91} 92 93static void * 94__repmgr_elect_thread(args) 95 void *args; 96{ 97 ENV *env = args; 98 int ret; 99 100 RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread")); 101 102 if ((ret = __repmgr_elect_main(env)) != 0) { 103 __db_err(env, ret, "election thread failed"); 104 __repmgr_thread_failure(env, ret); 105 } 106 107 RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting")); 108 return (NULL); 109} 110 111static int 112__repmgr_elect_main(env) 113 ENV *env; 114{ 115 DBT my_addr; 116 DB_ENV *dbenv; 117 DB_REP *db_rep; 118#ifdef DB_WIN32 119 DWORD duration; 120#else 121 struct timespec deadline; 122#endif 123 u_int32_t nsites, nvotes; 124 int done, failure_recovery, last_op; 125 int need_success, ret, succeeded, to_do; 126 127 COMPQUIET(need_success, TRUE); 128 129 dbenv = env->dbenv; 130 db_rep = env->rep_handle; 131 last_op = 0; 132 failure_recovery = succeeded = FALSE; 133 134 /* 135 * db_rep->operation_needed is the mechanism by which the outside world 136 * (running in a different thread) tells us what it wants us to do. It 137 * is obviously relevant when we're just starting up. But it can also 138 * be set if a subsequent request for us to do something occurs while 139 * we're still looping. 140 * 141 * ELECT_FAILURE_ELECTION asks us to start by doing an election, but to 142 * do so in failure recovery mode. This failure recovery mode may 143 * persist through several loop iterations: as long as it takes us to 144 * succeed in finding a master, or until we get asked to perform a new 145 * request. Thus the time for mapping ELECT_FAILURE_ELECTION to the 146 * internal ELECT_ELECTION, as well as the setting of the failure 147 * recovery flag, is at the point we receive the new request from 148 * operation_needed (either here, or within the loop below). 149 */ 150 LOCK_MUTEX(db_rep->mutex); 151 if (db_rep->finished) { 152 db_rep->elect_thread->finished = TRUE; 153 UNLOCK_MUTEX(db_rep->mutex); 154 return (0); 155 } 156 to_do = db_rep->operation_needed; 157 db_rep->operation_needed = 0; 158 UNLOCK_MUTEX(db_rep->mutex); 159 160 /* 161 * The way we are invoked determines the criterion for completion (which 162 * is represented as "need_success"): if we've been asked to do an 163 * election, we're only "done" when an election has actually succeeded. 164 * If we're just here trying to find the master initially, then merely 165 * getting a valid master_eid suffices. 166 */ 167 switch (to_do) { 168 case ELECT_FAILURE_ELECTION: 169 failure_recovery = TRUE; 170 to_do = ELECT_ELECTION; 171 /* FALLTHROUGH */ 172 case ELECT_ELECTION: 173 need_success = TRUE; 174 break; 175 case ELECT_SEEK_MASTER: 176 to_do = 0; /* Caller has already called rep_start. */ 177 /* FALLTHROUGH */ 178 case ELECT_REPSTART: 179 need_success = FALSE; 180 break; 181 default: 182 DB_ASSERT(env, FALSE); 183 } 184 /* Here, need_success has been initialized. */ 185 186 for (;;) { 187 RPRINT(env, DB_VERB_REPMGR_MISC, 188 (env, "elect thread to do: %d", to_do)); 189 switch (to_do) { 190 case ELECT_ELECTION: 191 nsites = __repmgr_get_nsites(db_rep); 192 /* 193 * With only 2 sites in the group, even a single failure 194 * could make it impossible to get a majority. So, 195 * fudge a little, unless the user really wants strict 196 * safety. 197 */ 198 if (nsites == 2 && 199 !FLD_ISSET(db_rep->region->config, 200 REP_C_2SITE_STRICT)) 201 nvotes = 1; 202 else 203 nvotes = ELECTION_MAJORITY(nsites); 204 205 /* 206 * If we're doing an election because we noticed that 207 * the master failed, it's reasonable to expect that the 208 * master won't participate. By not waiting for its 209 * vote, we can probably complete the election faster. 210 * But note that we shouldn't allow this to affect 211 * nvotes calculation. 212 * 213 * However, if we have 2 sites, and strict majority is 214 * turned on, now nvotes would be 2, and it doesn't make 215 * sense to rep_elect to see nsites of 1 in that case. 216 * So only decrement nsites if it currently exceeds 217 * nvotes. 218 */ 219 if (failure_recovery && nsites > nvotes) 220 nsites--; 221 222 switch (ret = 223 __rep_elect(dbenv, nsites, nvotes, 0)) { 224 case DB_REP_UNAVAIL: 225 break; 226 227 case 0: 228 succeeded = TRUE; 229 if (db_rep->takeover_pending) { 230 db_rep->takeover_pending = FALSE; 231 if ((ret = 232 __repmgr_become_master(env)) != 0) 233 return (ret); 234 } 235 break; 236 237 default: 238 __db_err( 239 env, ret, "unexpected election failure"); 240 return (ret); 241 } 242 last_op = ELECT_ELECTION; 243 break; 244 case ELECT_REPSTART: 245 if ((ret = 246 __repmgr_prepare_my_addr(env, &my_addr)) != 0) 247 return (ret); 248 ret = __rep_start(dbenv, &my_addr, DB_REP_CLIENT); 249 __os_free(env, my_addr.data); 250 if (ret != 0) { 251 __db_err(env, ret, "rep_start"); 252 return (ret); 253 } 254 last_op = ELECT_REPSTART; 255 break; 256 case 0: 257 /* 258 * Nothing to do: this can happen the first time 259 * through, on initialization. 260 */ 261 last_op = 0; 262 break; 263 default: 264 DB_ASSERT(env, FALSE); 265 } 266 267 /* 268 * Only the first election after a crashed master should be 269 * "fast". If that election fails and we have to retry, the 270 * crashed master may have rebooted in the interim. 271 */ 272 failure_recovery = FALSE; 273 274 LOCK_MUTEX(db_rep->mutex); 275 while (!succeeded && !__repmgr_is_ready(env)) { 276#ifdef DB_WIN32 277 duration = db_rep->election_retry_wait / US_PER_MS; 278 ret = SignalObjectAndWait(db_rep->mutex, 279 db_rep->check_election, duration, FALSE); 280 LOCK_MUTEX(db_rep->mutex); 281 if (ret == WAIT_TIMEOUT) 282 break; 283 DB_ASSERT(env, ret == WAIT_OBJECT_0); 284#else 285 __repmgr_compute_wait_deadline(env, &deadline, 286 db_rep->election_retry_wait); 287 if ((ret = pthread_cond_timedwait( 288 &db_rep->check_election, &db_rep->mutex, &deadline)) 289 == ETIMEDOUT) 290 break; 291 DB_ASSERT(env, ret == 0); 292#endif 293 } 294 295 /* 296 * Ways we can get here: election succeeded, sleep duration 297 * expired, "operation needed", or thread shut-down command. 298 * 299 * If we're not yet done, figure out what to do next (which may 300 * be trivially easy if we've been told explicitly, via the 301 * "operation needed" flag). We must first check if we've been 302 * told to do a specific operation, because that could make our 303 * completion criterion more stringent. Note that we never 304 * lessen our completion criterion (i.e., unlike the initial 305 * case, we may leave need_success untouched here). 306 */ 307 done = FALSE; 308 if ((to_do = db_rep->operation_needed) != 0) { 309 db_rep->operation_needed = 0; 310 switch (to_do) { 311 case ELECT_FAILURE_ELECTION: 312 failure_recovery = TRUE; 313 to_do = ELECT_ELECTION; 314 /* FALLTHROUGH */ 315 case ELECT_ELECTION: 316 need_success = TRUE; 317 break; 318 case ELECT_SEEK_MASTER: 319 to_do = 0; 320 break; 321 default: 322 break; 323 } 324 } else if ((done = (succeeded || 325 (!need_success && IS_VALID_EID(db_rep->master_eid)) || 326 db_rep->finished))) 327 db_rep->elect_thread->finished = TRUE; 328 else { 329 if (last_op == ELECT_ELECTION) 330 to_do = ELECT_REPSTART; 331 else { 332 /* 333 * Generally, if what we previously did is a 334 * rep_start (or nothing, which really just 335 * means another thread did the rep_start before 336 * turning us on), then we next do an election. 337 * However, with the REP_CLIENT init policy we 338 * never do an initial election. 339 */ 340 to_do = ELECT_ELECTION; 341 if (db_rep->init_policy == DB_REP_CLIENT && 342 !db_rep->found_master) 343 to_do = ELECT_REPSTART; 344 } 345 } 346 347 UNLOCK_MUTEX(db_rep->mutex); 348 if (done) 349 return (0); 350 } 351} 352 353/* 354 * Tests whether another thread has signalled for our attention. 355 */ 356static int 357__repmgr_is_ready(env) 358 ENV *env; 359{ 360 DB_REP *db_rep; 361 362 db_rep = env->rep_handle; 363 364 RPRINT(env, DB_VERB_REPMGR_MISC, (env, 365 "repmgr elect: opcode %d, finished %d, master %d", 366 db_rep->operation_needed, db_rep->finished, db_rep->master_eid)); 367 368 return (db_rep->operation_needed || db_rep->finished); 369} 370 371/* 372 * PUBLIC: int __repmgr_become_master __P((ENV *)); 373 */ 374int 375__repmgr_become_master(env) 376 ENV *env; 377{ 378 DBT my_addr; 379 DB_ENV *dbenv; 380 DB_REP *db_rep; 381 int ret; 382 383 dbenv = env->dbenv; 384 db_rep = env->rep_handle; 385 db_rep->master_eid = SELF_EID; 386 db_rep->found_master = TRUE; 387 388 /* 389 * At the moment, it's useless to pass my address to rep_start here, 390 * because rep_start ignores it in the case of MASTER. So we could 391 * avoid the trouble of allocating and freeing this memory. But might 392 * this conceivably change in the future? 393 */ 394 if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0) 395 return (ret); 396 ret = __rep_start(dbenv, &my_addr, DB_REP_MASTER); 397 __os_free(env, my_addr.data); 398 if (ret == 0) 399 __repmgr_stash_generation(env); 400 401 return (ret); 402} 403