/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2005-2009 Oracle. All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#define __INCLUDE_NETWORKING 1
#include "db_int.h"

static int __repmgr_is_ready __P((ENV *));
static int __repmgr_elect_main __P((ENV *));
static void *__repmgr_elect_thread __P((void *));
static int start_election_thread __P((ENV *));

/*
 * Starts the election thread, or wakes up an existing one, starting off with
 * the specified operation (an election, or a call to rep_start(CLIENT), or
 * nothing).  Avoid multiple concurrent elections.
 *
 * The request is recorded in db_rep->operation_needed, and then one of three
 * things happens:
 *	- no elect thread exists: a new one is started;
 *	- an elect thread exists but has marked itself finished: it is joined,
 *	  its runnable is freed, and a fresh thread is started;
 *	- an elect thread exists and is still running: it is signalled through
 *	  db_rep->check_election so that it notices the new request.
 *
 * If db_rep->finished is already set the request is ignored (and only logged).
 *
 * Returns 0 on success (including the "ignored" case), or the error returned
 * by the join, thread-start or signal call.
 *
 * PUBLIC: int __repmgr_init_election __P((ENV *, int));
 *
 * !!!
 * Caller must hold mutex.
 */
int
__repmgr_init_election(env, initial_operation)
	ENV *env;
	int initial_operation;
{
	DB_REP *db_rep;
	int ret;

	db_rep = env->rep_handle;
	if (db_rep->finished) {
		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
		    "ignoring elect thread request %d; repmgr is finished",
		    initial_operation));
		return (0);
	}

	/*
	 * Post the request before starting/waking the thread, so that the
	 * thread finds it as soon as it looks.
	 */
	db_rep->operation_needed = initial_operation;
	if (db_rep->elect_thread == NULL)
		ret = start_election_thread(env);
	else if (db_rep->elect_thread->finished) {
		/*
		 * The previous thread has declared itself done; reap it before
		 * replacing it.  On join failure the old runnable is left in
		 * place and the error is returned.
		 */
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "join dead elect thread"));
		if ((ret = __repmgr_thread_join(db_rep->elect_thread)) != 0)
			return (ret);
		__os_free(env, db_rep->elect_thread);
		db_rep->elect_thread = NULL;
		ret = start_election_thread(env);
	} else {
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "reusing existing elect thread"));
		if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
			__db_err(env, ret, "can't signal election thread");
	}
	return (ret);
}

/*
 * Allocates a REPMGR_RUNNABLE for the election thread, points it at
 * __repmgr_elect_thread, and starts it.  On success the runnable is published
 * in db_rep->elect_thread; on failure it is freed and db_rep->elect_thread is
 * left untouched.
 *
 * Returns 0 on success, or the error from __os_malloc/__repmgr_thread_start.
 *
 * !!!
 * Caller holds mutex.
 */
static int
start_election_thread(env)
	ENV *env;
{
	DB_REP *db_rep;
	REPMGR_RUNNABLE *elector;
	int ret;

	db_rep = env->rep_handle;

	if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector))
	    != 0)
		return (ret);
	elector->env = env;
	elector->run = __repmgr_elect_thread;

	/*
	 * db_rep->elect_thread is assigned only after the thread has been
	 * started.  NOTE(review): the new thread takes db_rep->mutex before it
	 * looks at db_rep->elect_thread, so this presumably relies on the
	 * caller still holding that mutex here -- confirm.
	 */
	if ((ret = __repmgr_thread_start(env, elector)) == 0)
		db_rep->elect_thread = elector;
	else
		__os_free(env, elector);

	return (ret);
}

/*
 * Thread entry point for the election thread.  The argument is used as the
 * ENV handle.  Runs __repmgr_elect_main; if that returns non-zero the error is
 * logged and reported via __repmgr_thread_failure.  Always returns NULL.
 */
static void *
__repmgr_elect_thread(args)
	void *args;
{
	ENV *env = args;
	int ret;

	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread"));

	if ((ret = __repmgr_elect_main(env)) != 0) {
		__db_err(env, ret, "election thread failed");
		__repmgr_thread_failure(env, ret);
	}

	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting"));
	return (NULL);
}

/*
 * Body of the election thread.  Repeatedly performs the requested operation
 * (an election, or a rep_start as client), then waits -- for at most
 * db_rep->election_retry_wait -- for either a new request or shut-down, and
 * decides what to do next.  The loop ends once the completion criterion is met
 * or db_rep->finished is set, at which point db_rep->elect_thread->finished is
 * set (under the mutex) so that the thread can later be joined.
 *
 * Returns 0 on normal completion, or the error from __rep_elect_int,
 * __repmgr_become_master, __repmgr_prepare_my_addr or __rep_start_int.
 *
 * NOTE(review): the error returns inside the loop do not set
 * db_rep->elect_thread->finished; presumably __repmgr_thread_failure (called
 * by __repmgr_elect_thread) makes that moot -- confirm.
 */
static int
__repmgr_elect_main(env)
	ENV *env;
{
	DBT my_addr;
	DB_REP *db_rep;
#ifdef DB_WIN32
	DWORD duration;
#else
	struct timespec deadline;
#endif
	u_int32_t nsites, nvotes;
	int done, failure_recovery, last_op;
	int need_success, ret, succeeded, to_do;

	COMPQUIET(need_success, TRUE);

	db_rep = env->rep_handle;
	last_op = 0;
	failure_recovery = succeeded = FALSE;

	/*
	 * db_rep->operation_needed is the mechanism by which the outside world
	 * (running in a different thread) tells us what it wants us to do.  It
	 * is obviously relevant when we're just starting up.  But it can also
	 * be set if a subsequent request for us to do something occurs while
	 * we're still looping.
	 *
	 * ELECT_FAILURE_ELECTION asks us to start by doing an election, but to
	 * do so in failure recovery mode.  This failure recovery mode may
	 * persist through several loop iterations: as long as it takes us to
	 * succeed in finding a master, or until we get asked to perform a new
	 * request.  Thus the time for mapping ELECT_FAILURE_ELECTION to the
	 * internal ELECT_ELECTION, as well as the setting of the failure
	 * recovery flag, is at the point we receive the new request from
	 * operation_needed (either here, or within the loop below).
	 */
	LOCK_MUTEX(db_rep->mutex);
	if (db_rep->finished) {
		/* Shut-down was requested before we even got going. */
		db_rep->elect_thread->finished = TRUE;
		UNLOCK_MUTEX(db_rep->mutex);
		return (0);
	}
	/* Consume the pending request. */
	to_do = db_rep->operation_needed;
	db_rep->operation_needed = 0;
	UNLOCK_MUTEX(db_rep->mutex);

	/*
	 * The way we are invoked determines the criterion for completion (which
	 * is represented as "need_success"): if we've been asked to do an
	 * election, we're only "done" when an election has actually succeeded.
	 * If we're just here trying to find the master initially, then merely
	 * getting a valid master_eid suffices.
	 *
	 * NOTE(review): a to_do of 0 reaches the asserting default case here,
	 * although the loop below has an explicit "case 0" for "nothing to
	 * do" -- confirm whether an initial operation of 0 is still possible.
	 */
	switch (to_do) {
	case ELECT_FAILURE_ELECTION:
		failure_recovery = TRUE;
		to_do = ELECT_ELECTION;
		/* FALLTHROUGH */
	case ELECT_ELECTION:
		need_success = TRUE;
		break;
	case ELECT_REPSTART:
		need_success = FALSE;
		break;
	default:
		DB_ASSERT(env, FALSE);
	}
	/* Here, need_success has been initialized. */

	for (;;) {
		RPRINT(env, DB_VERB_REPMGR_MISC,
		    (env, "elect thread to do: %d", to_do));
		switch (to_do) {
		case ELECT_ELECTION:
			nsites = __repmgr_get_nsites(db_rep);
			/*
			 * With only 2 sites in the group, even a single failure
			 * could make it impossible to get a majority.  So,
			 * fudge a little, unless the user really wants strict
			 * safety.
			 */
			if (nsites == 2 &&
			    !FLD_ISSET(db_rep->region->config,
			    REP_C_2SITE_STRICT))
				nvotes = 1;
			else
				nvotes = ELECTION_MAJORITY(nsites);

			/*
			 * If we're doing an election because we noticed that
			 * the master failed, it's reasonable to expect that the
			 * master won't participate.  By not waiting for its
			 * vote, we can probably complete the election faster.
			 * But note that we shouldn't allow this to affect
			 * nvotes calculation.
			 *
			 * However, if we have 2 sites, and strict majority is
			 * turned on, now nvotes would be 2, and it doesn't make
			 * sense to rep_elect to see nsites of 1 in that case.
			 * So only decrement nsites if it currently exceeds
			 * nvotes.
			 */
			if (failure_recovery && nsites > nvotes)
				nsites--;

			/* When leases are in use, nsites is passed as 0. */
			if (IS_USING_LEASES(env))
				nsites = 0;

			switch (ret =
			    __rep_elect_int(env, nsites, nvotes, 0)) {
			case DB_REP_UNAVAIL:
				/* No winner this time; we may retry later. */
				break;

			case 0:
				succeeded = TRUE;
				if (db_rep->takeover_pending) {
					db_rep->takeover_pending = FALSE;
					if ((ret =
					    __repmgr_become_master(env)) != 0)
						return (ret);
				}
				break;

			default:
				__db_err(
				    env, ret, "unexpected election failure");
				return (ret);
			}
			last_op = ELECT_ELECTION;
			break;
		case ELECT_REPSTART:
			if ((ret =
			    __repmgr_prepare_my_addr(env, &my_addr)) != 0)
				return (ret);
			ret = __rep_start_int(env, &my_addr, DB_REP_CLIENT);
			/* The address buffer is freed whether or not it worked. */
			__os_free(env, my_addr.data);
			if (ret != 0) {
				__db_err(env, ret, "rep_start");
				return (ret);
			}
			last_op = ELECT_REPSTART;
			break;
		case 0:
			/*
			 * Nothing to do: this can happen the first time
			 * through, on initialization.
			 */
			last_op = 0;
			break;
		default:
			DB_ASSERT(env, FALSE);
		}

		/*
		 * Only the first election after a crashed master should be
		 * "fast".  If that election fails and we have to retry, the
		 * crashed master may have rebooted in the interim.
		 */
		failure_recovery = FALSE;

		/*
		 * Unless the election just succeeded, sleep until we are
		 * signalled (new request or shut-down) or the retry interval
		 * expires.  The mutex is held on exit from this loop on both
		 * platforms, whichever way the loop ends.
		 *
		 * NOTE(review): "succeeded" is set to TRUE on a successful
		 * election and is never reset afterwards, so if a later
		 * request arrives after a success, subsequent iterations skip
		 * this wait and are treated as done -- confirm this is
		 * intended.
		 */
		LOCK_MUTEX(db_rep->mutex);
		while (!succeeded && !__repmgr_is_ready(env)) {
#ifdef DB_WIN32
			duration = db_rep->election_retry_wait / US_PER_MS;
			ret = SignalObjectAndWait(*db_rep->mutex,
			    db_rep->check_election, duration, FALSE);
			/* The mutex was released by the call; take it back. */
			LOCK_MUTEX(db_rep->mutex);
			if (ret == WAIT_TIMEOUT)
				break;
			DB_ASSERT(env, ret == WAIT_OBJECT_0);
#else
			__repmgr_compute_wait_deadline(env, &deadline,
			    db_rep->election_retry_wait);
			if ((ret = pthread_cond_timedwait(
			    &db_rep->check_election, db_rep->mutex, &deadline))
			    == ETIMEDOUT)
				break;
			DB_ASSERT(env, ret == 0);
#endif
		}

		/*
		 * Ways we can get here: election succeeded, sleep duration
		 * expired, "operation needed", or thread shut-down command.
		 *
		 * If we're not yet done, figure out what to do next (which may
		 * be trivially easy if we've been told explicitly, via the
		 * "operation needed" flag).  We must first check if we've been
		 * told to do a specific operation, because that could make our
		 * completion criterion more stringent.  Note that we never
		 * lessen our completion criterion (i.e., unlike the initial
		 * case, we may leave need_success untouched here).
		 */
		done = FALSE;
		if ((to_do = db_rep->operation_needed) != 0) {
			/* An explicit request takes priority; consume it. */
			db_rep->operation_needed = 0;
			switch (to_do) {
			case ELECT_FAILURE_ELECTION:
				failure_recovery = TRUE;
				to_do = ELECT_ELECTION;
				/* FALLTHROUGH */
			case ELECT_ELECTION:
				need_success = TRUE;
				break;
			default:
				/* Other requests leave need_success as it is. */
				break;
			}
		} else if ((done = (succeeded ||
		    (!need_success && IS_VALID_EID(db_rep->master_eid)) ||
		    db_rep->finished)))
			/* Mark ourselves joinable while still under the mutex. */
			db_rep->elect_thread->finished = TRUE;
		else {
			/* No request and not done: alternate operations. */
			if (last_op == ELECT_ELECTION)
				to_do = ELECT_REPSTART;
			else {
				/*
				 * Generally, if what we previously did is a
				 * rep_start (or nothing, which really just
				 * means another thread did the rep_start before
				 * turning us on), then we next do an election.
				 * However, with the REP_CLIENT init policy we
				 * never do an initial election.
				 */
				to_do = ELECT_ELECTION;
				if (db_rep->init_policy == DB_REP_CLIENT &&
				    !db_rep->found_master)
					to_do = ELECT_REPSTART;
			}
		}

		UNLOCK_MUTEX(db_rep->mutex);
		if (done)
			return (0);
	}
}

/*
 * Tests whether another thread has signalled for our attention.
 *
 * Logs the current operation_needed, finished and master_eid values, then
 * returns non-zero if a request is pending (db_rep->operation_needed != 0) or
 * shut-down has been requested (db_rep->finished), and 0 otherwise.
 *
 * Its only caller in this file, the wait loop in __repmgr_elect_main, invokes
 * it with db_rep->mutex held.
 */
static int
__repmgr_is_ready(env)
	ENV *env;
{
	DB_REP *db_rep;

	db_rep = env->rep_handle;

	RPRINT(env, DB_VERB_REPMGR_MISC, (env,
	    "repmgr elect: opcode %d, finished %d, master %d",
	    db_rep->operation_needed, db_rep->finished, db_rep->master_eid));

	return (db_rep->operation_needed || db_rep->finished);
}

/*
 * Makes the local site the master: calls __rep_start_int with DB_REP_MASTER
 * and, on success, records SELF_EID as db_rep->master_eid and sets
 * db_rep->found_master.
 *
 * Returns 0 on success, or the error from __repmgr_prepare_my_addr or
 * __rep_start_int (in which case master_eid/found_master are not modified).
 *
 * PUBLIC: int __repmgr_become_master __P((ENV *));
 */
int
__repmgr_become_master(env)
	ENV *env;
{
	DBT my_addr;
	DB_REP *db_rep;
	int ret;

	db_rep = env->rep_handle;

	/*
	 * At the moment, it's useless to pass my address to rep_start here,
	 * because rep_start ignores it in the case of MASTER.  So we could
	 * avoid the trouble of allocating and freeing this memory.  But might
	 * this conceivably change in the future?
	 */
	if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
		return (ret);
	ret = __rep_start_int(env, &my_addr, DB_REP_MASTER);
	/* Release the address buffer regardless of the rep_start outcome. */
	__os_free(env, my_addr.data);

	if (ret == 0) {
		db_rep->master_eid = SELF_EID;
		db_rep->found_master = TRUE;
	}
	return (ret);
}