1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2004,2008 Oracle. All rights reserved. 5 * 6 * $Id: rep_elect.c,v 12.78 2008/03/13 16:21:04 mbrey Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/db_am.h" 14#include "dbinc/log.h" 15 16/* 17 * We need to check sites == nsites, not more than half 18 * like we do in __rep_elect and the VOTE2 code. The 19 * reason is that we want to process all the incoming votes 20 * and not short-circuit once we reach more than half. The 21 * real winner's vote may be in the last half. 22 */ 23#define IS_PHASE1_DONE(rep) \ 24 ((rep)->sites >= (rep)->nsites && (rep)->w_priority > 0) 25 26#define I_HAVE_WON(rep, winner) \ 27 ((rep)->votes >= (rep)->nvotes && winner == (rep)->eid) 28 29static void __rep_cmp_vote __P((ENV *, REP *, int, DB_LSN *, 30 u_int32_t, u_int32_t, u_int32_t, u_int32_t)); 31static int __rep_elect_init 32 __P((ENV *, u_int32_t, u_int32_t, int *, u_int32_t *)); 33static int __rep_fire_elected __P((ENV *, REP *, u_int32_t)); 34static void __rep_elect_master __P((ENV *, REP *)); 35static int __rep_tally __P((ENV *, REP *, int, u_int32_t *, u_int32_t, roff_t)); 36static int __rep_wait __P((ENV *, db_timeout_t *, int *, int, u_int32_t)); 37 38/* 39 * __rep_elect -- 40 * Called after master failure to hold/participate in an election for 41 * a new master. 42 * 43 * PUBLIC: int __rep_elect __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); 44 */ 45int 46__rep_elect(dbenv, given_nsites, nvotes, flags) 47 DB_ENV *dbenv; 48 u_int32_t given_nsites, nvotes; 49 u_int32_t flags; 50{ 51 DB_LOG *dblp; 52 DB_LSN lsn; 53 DB_REP *db_rep; 54 DB_THREAD_INFO *ip; 55 ENV *env; 56 LOG *lp; 57 REP *rep; 58 int done, eid, elected, full_elect, locked, in_progress, need_req; 59 int ret, send_vote, t_ret; 60 u_int32_t ack, ctlflags, egen, nsites, orig_tally, priority, realpri; 61 u_int32_t tiebreaker; 62 db_timeout_t timeout, to; 63 64 env = dbenv->env; 65 66 COMPQUIET(flags, 0); 67 COMPQUIET(egen, 0); 68 69 ENV_REQUIRES_CONFIG_XX( 70 env, rep_handle, "DB_ENV->rep_elect", DB_INIT_REP); 71 72 /* Error checking. */ 73 if (IS_USING_LEASES(env) && given_nsites != 0) { 74 __db_errx(env, 75 "DB_ENV->rep_elect: nsites must be zero if leases configured"); 76 return (EINVAL); 77 } 78 79 db_rep = env->rep_handle; 80 rep = db_rep->region; 81 dblp = env->lg_handle; 82 lp = dblp->reginfo.primary; 83 elected = 0; 84 85 /* 86 * Specifying 0 for nsites signals us to use the value configured 87 * previously via rep_set_nsites. Similarly, if the given nvotes is 0, 88 * it asks us to compute the value representing a simple majority. 89 */ 90 nsites = given_nsites == 0 ? rep->config_nsites : given_nsites; 91 ack = nvotes == 0 ? ELECTION_MAJORITY(nsites) : nvotes; 92 locked = 0; 93 94 /* 95 * XXX 96 * If users give us less than a majority, they run the risk of 97 * having a network partition. However, this also allows the 98 * scenario of master/1 client to elect the client. Allow 99 * sub-majority values, but give a warning. 100 */ 101 if (ack <= (nsites / 2)) { 102 __db_errx(env, 103 "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)", 104 nvotes, nsites); 105 } 106 107 if (nsites < ack) { 108 __db_errx(env, 109 "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)", 110 ack, nsites); 111 return (EINVAL); 112 } 113 114 /* 115 * Default to the normal timeout unless the user configured 116 * a full election timeout and we think we need a full election. 117 */ 118 full_elect = 0; 119 timeout = rep->elect_timeout; 120 if (!F_ISSET(rep, REP_F_GROUP_ESTD) && rep->full_elect_timeout != 0) { 121 full_elect = 1; 122 timeout = rep->full_elect_timeout; 123 } 124 realpri = rep->priority; 125 126 RPRINT(env, DB_VERB_REP_ELECT, 127 (env, "Start election nsites %d, ack %d, priority %d", 128 nsites, ack, realpri)); 129 130 /* 131 * Special case when having an election while running with 132 * sites of potentially mixed versions. We set a bit indicating 133 * we're an electable site, but set our priority to 0. 134 * Old sites will never elect us, with 0 priority, but if all 135 * we have are new sites, then we can elect the best electable 136 * site of the group. 137 * Thus 'priority' is this special, possibly-fake, effective 138 * priority that we'll use for this election, while 'realpri' is our 139 * real, configured priority, as retrieved from REP region. 140 */ 141 ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0; 142 ENV_ENTER(env, ip); 143 144 orig_tally = 0; 145 if ((ret = __rep_elect_init(env, nsites, ack, 146 &in_progress, &orig_tally)) != 0) { 147 if (ret == DB_REP_NEWMASTER) 148 ret = 0; 149 goto err; 150 } 151 /* 152 * If another thread is in the middle of an election we 153 * just quietly return and not interfere. 154 */ 155 if (in_progress) 156 goto edone; 157 158 priority = lp->persist.version != DB_LOGVERSION ? 0 : realpri; 159#ifdef CONFIG_TEST 160 /* 161 * This allows us to unit test the ELECTABLE flag simply by 162 * using the priority values. 163 */ 164 if (priority > 0 && priority <= 5) { 165 RPRINT(env, DB_VERB_REP_ELECT, (env, 166 "Artificially setting priority 0 (ELECTABLE) for CONFIG_TEST mode")); 167 DB_ASSERT(env, ctlflags == REPCTL_ELECTABLE); 168 priority = 0; 169 } 170#endif 171 __os_gettime(env, &rep->etime, 1); 172 REP_SYSTEM_LOCK(env); 173 /* 174 * If leases are configured, wait for them to expire, and 175 * see if we can discover the master while waiting. 176 */ 177 if (IS_USING_LEASES(env)) { 178 to = __rep_lease_waittime(env); 179 if (to != 0) { 180 F_SET(rep, REP_F_EPHASE0); 181 REP_SYSTEM_UNLOCK(env); 182 (void)__rep_send_message(env, DB_EID_BROADCAST, 183 REP_MASTER_REQ, NULL, NULL, 0, 0); 184 ret = __rep_wait(env, &to, &eid, 185 0, REP_F_EPHASE0); 186 REP_SYSTEM_LOCK(env); 187 F_CLR(rep, REP_F_EPHASE0); 188 switch (ret) { 189 /* 190 * If waiting is successful, our flag is cleared 191 * and the master responded. We're done. 192 */ 193 case DB_REP_EGENCHG: 194 case 0: 195 REP_SYSTEM_UNLOCK(env); 196 goto edone; 197 /* 198 * If we get a timeout, continue with the election. 199 */ 200 case DB_TIMEOUT: 201 break; 202 default: 203 goto lockdone; 204 } 205 } 206 } 207 /* 208 * We need to lockout applying incoming log records during 209 * the election. We need to use a special rep_lockout_apply 210 * instead of rep_lockout_msg because we do not want to 211 * lockout all incoming messages, like other VOTEs! 212 */ 213 if ((ret = __rep_lockout_apply(env, rep, 0)) != 0) 214 goto lockdone; 215 locked = 1; 216 to = timeout; 217 REP_SYSTEM_UNLOCK(env); 218restart: 219 /* Generate a randomized tiebreaker value. */ 220 __os_unique_id(env, &tiebreaker); 221 LOG_SYSTEM_LOCK(env); 222 lsn = lp->lsn; 223 LOG_SYSTEM_UNLOCK(env); 224 REP_SYSTEM_LOCK(env); 225 226 F_SET(rep, REP_F_EPHASE1 | REP_F_NOARCHIVE); 227 F_CLR(rep, REP_F_TALLY); 228 /* 229 * We made sure that leases were expired before starting the 230 * election, but an existing master may be slow in responding. 231 * If, during lockout, acquiring mutexes, etc, the client has now 232 * re-granted its lease, we're done - a master exists. 233 */ 234 if (IS_USING_LEASES(env) && 235 __rep_islease_granted(env)) { 236 ret = 0; 237 goto lockdone; 238 } 239 240 /* 241 * If we are in the middle of recovering or internal 242 * init, we participate, but we set our priority to 0 243 * and turn off REPCTL_ELECTABLE. We *cannot* use the 244 * REP_F_RECOVER_MASK macro because we must explicitly 245 * exclude REP_F_RECOVER_VERIFY. If we are in verify 246 * then that is okay, we can be elected (i.e. we are not 247 * in an inconsistent state). 248 */ 249 if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP | 250 REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | REP_F_RECOVER_UPDATE)) { 251 RPRINT(env, DB_VERB_REP_ELECT, (env, 252 "Setting priority 0, unelectable, due to internal init/recovery")); 253 priority = 0; 254 ctlflags = 0; 255 } 256 257 /* 258 * We are about to participate at this egen. We must 259 * write out the next egen before participating in this one 260 * so that if we crash we can never participate in this egen 261 * again. 262 */ 263 if ((ret = __rep_write_egen(env, rep->egen + 1)) != 0) 264 goto lockdone; 265 266 /* Tally our own vote */ 267 if (__rep_tally(env, rep, rep->eid, &rep->sites, rep->egen, 268 rep->tally_off) != 0) { 269 ret = EINVAL; 270 goto lockdone; 271 } 272 __rep_cmp_vote(env, rep, rep->eid, &lsn, priority, rep->gen, 273 tiebreaker, ctlflags); 274 275 RPRINT(env, DB_VERB_REP_ELECT, (env, "Beginning an election")); 276 277 /* Now send vote */ 278 send_vote = DB_EID_INVALID; 279 egen = rep->egen; 280 done = IS_PHASE1_DONE(rep); 281 REP_SYSTEM_UNLOCK(env); 282 __rep_send_vote(env, &lsn, nsites, ack, priority, tiebreaker, egen, 283 DB_EID_BROADCAST, REP_VOTE1, ctlflags); 284 DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTVOTE1, ret, NULL); 285 if (done) { 286 REP_SYSTEM_LOCK(env); 287 goto vote; 288 } 289 ret = __rep_wait(env, &to, &eid, full_elect, REP_F_EPHASE1); 290 switch (ret) { 291 case 0: 292 /* Check if election complete or phase complete. */ 293 if (eid != DB_EID_INVALID && !IN_ELECTION(rep)) { 294 RPRINT(env, DB_VERB_REP_ELECT, 295 (env, "Ended election phase 1")); 296 goto edone; 297 } 298 goto phase2; 299 case DB_REP_EGENCHG: 300 if (to > timeout) 301 to = timeout; 302 to = (to * 8) / 10; 303 RPRINT(env, DB_VERB_REP_ELECT, (env, 304"Egen changed while waiting. Now %lu. New timeout %lu, orig timeout %lu", 305 (u_long)rep->egen, (u_long)to, (u_long)timeout)); 306 /* 307 * If the egen changed while we were sleeping, that 308 * means we're probably late to the next election, 309 * so we'll backoff our timeout so that we don't get 310 * into an out-of-phase election scenario. 311 * 312 * Backoff to 80% of the current timeout. 313 */ 314 goto restart; 315 case DB_TIMEOUT: 316 break; 317 default: 318 goto err; 319 } 320 /* 321 * If we got here, we haven't heard from everyone, but we've 322 * run out of time, so it's time to decide if we have enough 323 * votes to pick a winner and if so, to send out a vote to 324 * the winner. 325 */ 326 REP_SYSTEM_LOCK(env); 327 /* 328 * If our egen changed while we were waiting. We need to 329 * essentially reinitialize our election. 330 */ 331 if (egen != rep->egen) { 332 REP_SYSTEM_UNLOCK(env); 333 RPRINT(env, DB_VERB_REP_ELECT, 334 (env, "Egen changed from %lu to %lu", 335 (u_long)egen, (u_long)rep->egen)); 336 goto restart; 337 } 338 if (rep->sites >= rep->nvotes) { 339vote: 340 /* We think we've seen enough to cast a vote. */ 341 send_vote = rep->winner; 342 /* 343 * See if we won. This will make sure we 344 * don't count ourselves twice if we're racing 345 * with incoming votes. 346 */ 347 if (rep->winner == rep->eid) { 348 (void)__rep_tally(env, rep, rep->eid, &rep->votes, 349 egen, rep->v2tally_off); 350 RPRINT(env, DB_VERB_REP_ELECT, (env, 351 "Counted my vote %d", rep->votes)); 352 } 353 F_SET(rep, REP_F_EPHASE2); 354 F_CLR(rep, REP_F_EPHASE1); 355 } 356 REP_SYSTEM_UNLOCK(env); 357 if (send_vote == DB_EID_INVALID) { 358 /* We do not have enough votes to elect. */ 359 if (rep->sites >= rep->nvotes) 360 __db_errx(env, 361 "No electable site found: recvd %d of %d votes from %d sites", 362 rep->sites, rep->nvotes, rep->nsites); 363 else 364 __db_errx(env, 365 "Not enough votes to elect: recvd %d of %d from %d sites", 366 rep->sites, rep->nvotes, rep->nsites); 367 ret = DB_REP_UNAVAIL; 368 goto err; 369 } 370 371 /* 372 * We have seen enough vote1's. Now we need to wait 373 * for all the vote2's. 374 */ 375 if (send_vote != rep->eid) { 376 RPRINT(env, DB_VERB_REP_ELECT, (env, "Sending vote")); 377 __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 378 send_vote, REP_VOTE2, 0); 379 /* 380 * If we are NOT the new master we want to send 381 * our vote to the winner, and wait longer. The 382 * reason is that the winner may be "behind" us 383 * in the election waiting and if the master is 384 * down, the winner will wait the full timeout 385 * and we want to give the winner enough time to 386 * process all the votes. Otherwise we could 387 * incorrectly return DB_REP_UNAVAIL and start a 388 * new election before the winner can declare 389 * itself. 390 */ 391 to = to * 2; 392 } 393 394phase2: 395 if (I_HAVE_WON(rep, rep->winner)) { 396 RPRINT(env, DB_VERB_REP_ELECT, (env, 397 "Skipping phase2 wait: already got %d votes", rep->votes)); 398 REP_SYSTEM_LOCK(env); 399 goto i_won; 400 } 401 ret = __rep_wait(env, &to, &eid, full_elect, REP_F_EPHASE2); 402 RPRINT(env, DB_VERB_REP_ELECT, 403 (env, "Ended election phase 2 %d", ret)); 404 switch (ret) { 405 case 0: 406 if (eid != DB_EID_INVALID) 407 goto edone; 408 ret = DB_REP_UNAVAIL; 409 break; 410 case DB_REP_EGENCHG: 411 if (to > timeout) 412 to = timeout; 413 to = (to * 8) / 10; 414 RPRINT(env, DB_VERB_REP_ELECT, (env, 415"While waiting egen changed to %lu. Phase 2 New timeout %lu, orig timeout %lu", 416 (u_long)rep->egen, 417 (u_long)to, (u_long)timeout)); 418 goto restart; 419 case DB_TIMEOUT: 420 ret = DB_REP_UNAVAIL; 421 break; 422 default: 423 goto err; 424 } 425 REP_SYSTEM_LOCK(env); 426 if (egen != rep->egen) { 427 REP_SYSTEM_UNLOCK(env); 428 RPRINT(env, DB_VERB_REP_ELECT, (env, 429 "Egen ph2 changed from %lu to %lu", 430 (u_long)egen, (u_long)rep->egen)); 431 goto restart; 432 } 433 done = rep->votes >= rep->nvotes; 434 RPRINT(env, DB_VERB_REP_ELECT, (env, 435 "After phase 2: votes %d, nvotes %d, nsites %d", 436 rep->votes, rep->nvotes, rep->nsites)); 437 if (I_HAVE_WON(rep, rep->winner)) { 438i_won: __rep_elect_master(env, rep); 439 ret = 0; 440 elected = 1; 441 } 442 if (0) { 443err: REP_SYSTEM_LOCK(env); 444 } 445lockdone: 446 /* 447 * If we get here because of a non-election error, then we 448 * did not tally our vote. The only non-election error is 449 * from elect_init where we were unable to grow_sites. In 450 * that case we do not want to discard all known election info. 451 */ 452 if (ret == 0 || ret == DB_REP_UNAVAIL) 453 __rep_elect_done(env, rep, 0); 454 else if (orig_tally) 455 F_SET(rep, orig_tally); 456 457 /* 458 * If the election finished elsewhere, we need to clear 459 * the elect flag anyway. 460 */ 461 if (0) { 462edone: REP_SYSTEM_LOCK(env); 463 } 464 F_CLR(rep, REP_F_INREPELECT); 465 if (locked) { 466 need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY); 467 F_CLR(rep, REP_F_READY_APPLY | REP_F_SKIPPED_APPLY); 468 REP_SYSTEM_UNLOCK(env); 469 /* 470 * If we skipped any log records, request them now. 471 */ 472 if (need_req && (t_ret = __rep_resend_req(env, 0)) != 0 && 473 ret == 0) 474 ret = t_ret; 475 } else 476 REP_SYSTEM_UNLOCK(env); 477 478 if (elected) 479 ret = __rep_fire_elected(env, rep, egen); 480 481 RPRINT(env, DB_VERB_REP_ELECT, (env, 482 "Ended election with %d, sites %d, egen %lu, flags 0x%lx", 483 ret, rep->sites, (u_long)rep->egen, (u_long)rep->flags)); 484 485DB_TEST_RECOVERY_LABEL 486 ENV_LEAVE(env, ip); 487 return (ret); 488} 489 490/* 491 * __rep_vote1 -- 492 * Handle incoming vote1 message on a client. 493 * 494 * PUBLIC: int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int)); 495 */ 496int 497__rep_vote1(env, rp, rec, eid) 498 ENV *env; 499 __rep_control_args *rp; 500 DBT *rec; 501 int eid; 502{ 503 DBT data_dbt; 504 DB_LOG *dblp; 505 DB_LSN lsn; 506 DB_REP *db_rep; 507 LOG *lp; 508 REP *rep; 509 REP_OLD_VOTE_INFO *ovi; 510 __rep_egen_args egen_arg; 511 __rep_vote_info_args tmpvi, *vi; 512 u_int32_t egen; 513 int elected, master, ret; 514 u_int8_t buf[__REP_MAXMSG_SIZE]; 515 size_t len; 516 517 COMPQUIET(egen, 0); 518 519 elected = ret = 0; 520 db_rep = env->rep_handle; 521 rep = db_rep->region; 522 dblp = env->lg_handle; 523 lp = dblp->reginfo.primary; 524 525 if (F_ISSET(rep, REP_F_MASTER)) { 526 RPRINT(env, DB_VERB_REP_ELECT, (env, "Master received vote")); 527 LOG_SYSTEM_LOCK(env); 528 lsn = lp->lsn; 529 LOG_SYSTEM_UNLOCK(env); 530 (void)__rep_send_message(env, 531 DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); 532 if (IS_USING_LEASES(env)) 533 ret = __rep_lease_refresh(env); 534 return (ret); 535 } 536 537 /* 538 * In 4.7 we changed to having fixed sized u_int32_t's from 539 * non-fixed 'int' fields in the vote structure. 540 */ 541 if (rp->rep_version < DB_REPVERSION_47) { 542 ovi = (REP_OLD_VOTE_INFO *)rec->data; 543 tmpvi.egen = ovi->egen; 544 tmpvi.nsites = (u_int32_t)ovi->nsites; 545 tmpvi.nvotes = (u_int32_t)ovi->nvotes; 546 tmpvi.priority = (u_int32_t)ovi->priority; 547 tmpvi.tiebreaker = ovi->tiebreaker; 548 } else 549 if ((ret = __rep_vote_info_unmarshal(env, 550 &tmpvi, rec->data, rec->size, NULL)) != 0) 551 return (ret); 552 vi = &tmpvi; 553 REP_SYSTEM_LOCK(env); 554 555 /* 556 * If we get a vote from a later election gen, we 557 * clear everything from the current one, and we'll 558 * start over by tallying it. If we get an old vote, 559 * send an ALIVE to the old participant. 560 */ 561 RPRINT(env, DB_VERB_REP_ELECT, 562 (env, "Received vote1 egen %lu, egen %lu", 563 (u_long)vi->egen, (u_long)rep->egen)); 564 if (vi->egen < rep->egen) { 565 RPRINT(env, DB_VERB_REP_ELECT, (env, 566 "Received old vote %lu, egen %lu, ignoring vote1", 567 (u_long)vi->egen, (u_long)rep->egen)); 568 egen_arg.egen = rep->egen; 569 REP_SYSTEM_UNLOCK(env); 570 if (rep->version < DB_REPVERSION_47) 571 DB_INIT_DBT(data_dbt, &egen_arg.egen, 572 sizeof(egen_arg.egen)); 573 else { 574 if ((ret = __rep_egen_marshal(env, 575 &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0) 576 return (ret); 577 DB_INIT_DBT(data_dbt, buf, len); 578 } 579 (void)__rep_send_message(env, 580 eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0); 581 return (ret); 582 } 583 if (vi->egen > rep->egen) { 584 RPRINT(env, DB_VERB_REP_ELECT, (env, 585 "Received VOTE1 from egen %lu, my egen %lu; reset", 586 (u_long)vi->egen, (u_long)rep->egen)); 587 __rep_elect_done(env, rep, 0); 588 rep->egen = vi->egen; 589 F_SET(rep, REP_F_EGENUPDATE); 590 } 591 592 /* 593 * If this site (sender of the VOTE1) is the first to the party, simply 594 * initialize values from the message. Otherwise, see if the site knows 595 * about more sites, and/or requires more votes, than we do. 596 */ 597 if (!IN_ELECTION_TALLY(rep)) { 598 F_SET(rep, REP_F_TALLY); 599 rep->nsites = vi->nsites; 600 rep->nvotes = vi->nvotes; 601 } else { 602 if (vi->nsites > rep->nsites) 603 rep->nsites = vi->nsites; 604 if (vi->nvotes > rep->nvotes) 605 rep->nvotes = vi->nvotes; 606 } 607 608 /* 609 * We are keeping the vote, let's see if that changes our 610 * count of the number of sites. 611 */ 612 if (rep->sites + 1 > rep->nsites) 613 rep->nsites = rep->sites + 1; 614 if (rep->nsites > rep->asites && 615 (ret = __rep_grow_sites(env, rep->nsites)) != 0) { 616 RPRINT(env, DB_VERB_REP_ELECT, (env, 617 "Grow sites returned error %d", ret)); 618 goto err; 619 } 620 621 /* 622 * Ignore vote1's if we're in phase 2. 623 */ 624 if (F_ISSET(rep, REP_F_EPHASE2)) { 625 RPRINT(env, DB_VERB_REP_ELECT, 626 (env, "In phase 2, ignoring vote1")); 627 goto err; 628 } 629 630 /* 631 * Record this vote. If we get back non-zero, we 632 * ignore the vote. 633 */ 634 if ((ret = __rep_tally(env, rep, eid, &rep->sites, 635 vi->egen, rep->tally_off)) != 0) { 636 RPRINT(env, DB_VERB_REP_ELECT, 637 (env, "Tally returned %d, sites %d", ret, rep->sites)); 638 ret = 0; 639 goto err; 640 } 641 RPRINT(env, DB_VERB_REP_ELECT, (env, 642 "Incoming vote: (eid)%d (pri)%lu %s (gen)%lu (egen)%lu [%lu,%lu]", 643 eid, (u_long)vi->priority, 644 F_ISSET(rp, REPCTL_ELECTABLE) ? "ELECTABLE" : "", 645 (u_long)rp->gen, (u_long)vi->egen, 646 (u_long)rp->lsn.file, (u_long)rp->lsn.offset)); 647 if (rep->sites > 1) 648 RPRINT(env, DB_VERB_REP_ELECT, (env, 649 "Existing vote: (eid)%d (pri)%lu (gen)%lu (sites)%d [%lu,%lu]", 650 rep->winner, (u_long)rep->w_priority, 651 (u_long)rep->w_gen, rep->sites, 652 (u_long)rep->w_lsn.file, 653 (u_long)rep->w_lsn.offset)); 654 655 __rep_cmp_vote(env, rep, eid, &rp->lsn, vi->priority, 656 rp->gen, vi->tiebreaker, rp->flags); 657 /* 658 * If you get a vote and you're not in an election, we've 659 * already recorded this vote. But that is all we need 660 * to do. 661 */ 662 if (!IN_ELECTION(rep)) { 663 RPRINT(env, DB_VERB_REP_ELECT, (env, 664 "Not in election, but received vote1 0x%x", rep->flags)); 665 ret = DB_REP_HOLDELECTION; 666 goto err; 667 } 668 669 master = rep->winner; 670 lsn = rep->w_lsn; 671 if (IS_PHASE1_DONE(rep)) { 672 RPRINT(env, DB_VERB_REP_ELECT, (env, "Phase1 election done")); 673 RPRINT(env, DB_VERB_REP_ELECT, (env, "Voting for %d%s", 674 master, master == rep->eid ? "(self)" : "")); 675 egen = rep->egen; 676 F_SET(rep, REP_F_EPHASE2); 677 F_CLR(rep, REP_F_EPHASE1); 678 if (master == rep->eid) { 679 (void)__rep_tally(env, rep, rep->eid, 680 &rep->votes, egen, rep->v2tally_off); 681 RPRINT(env, DB_VERB_REP_ELECT, (env, 682 "After phase 1 done: counted vote %d of %d", 683 rep->votes, rep->nvotes)); 684 if (I_HAVE_WON(rep, rep->winner)) { 685 __rep_elect_master(env, rep); 686 elected = 1; 687 } 688 goto err; 689 } 690 REP_SYSTEM_UNLOCK(env); 691 692 /* Vote for someone else. */ 693 __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, 694 master, REP_VOTE2, 0); 695 } else 696err: REP_SYSTEM_UNLOCK(env); 697 if (elected) 698 ret = __rep_fire_elected(env, rep, egen); 699 return (ret); 700} 701 702/* 703 * __rep_vote2 -- 704 * Handle incoming vote2 message on a client. 705 * 706 * PUBLIC: int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int)); 707 */ 708int 709__rep_vote2(env, rp, rec, eid) 710 ENV *env; 711 __rep_control_args *rp; 712 DBT *rec; 713 int eid; 714{ 715 DB_LOG *dblp; 716 DB_LSN lsn; 717 DB_REP *db_rep; 718 LOG *lp; 719 REP *rep; 720 REP_OLD_VOTE_INFO *ovi; 721 __rep_vote_info_args tmpvi, *vi; 722 u_int32_t egen; 723 int ret; 724 725 ret = 0; 726 db_rep = env->rep_handle; 727 rep = db_rep->region; 728 dblp = env->lg_handle; 729 lp = dblp->reginfo.primary; 730 731 RPRINT(env, DB_VERB_REP_ELECT, (env, "We received a vote%s", 732 F_ISSET(rep, REP_F_MASTER) ? " (master)" : "")); 733 if (F_ISSET(rep, REP_F_MASTER)) { 734 LOG_SYSTEM_LOCK(env); 735 lsn = lp->lsn; 736 LOG_SYSTEM_UNLOCK(env); 737 STAT(rep->stat.st_elections_won++); 738 (void)__rep_send_message(env, 739 DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); 740 if (IS_USING_LEASES(env)) 741 ret = __rep_lease_refresh(env); 742 return (ret); 743 } 744 745 REP_SYSTEM_LOCK(env); 746 egen = rep->egen; 747 748 /* If we have priority 0, we should never get a vote. */ 749 DB_ASSERT(env, rep->priority != 0); 750 751 /* 752 * We might be the last to the party and we haven't had 753 * time to tally all the vote1's, but others have and 754 * decided we're the winner. So, if we're in the process 755 * of tallying sites, keep the vote so that when our 756 * election thread catches up we'll have the votes we 757 * already received. 758 */ 759 /* 760 * In 4.7 we changed to having fixed sized u_int32_t's from 761 * non-fixed 'int' fields in the vote structure. 762 */ 763 if (rp->rep_version < DB_REPVERSION_47) { 764 ovi = (REP_OLD_VOTE_INFO *)rec->data; 765 tmpvi.egen = ovi->egen; 766 tmpvi.nsites = (u_int32_t)ovi->nsites; 767 tmpvi.nvotes = (u_int32_t)ovi->nvotes; 768 tmpvi.priority = (u_int32_t)ovi->priority; 769 tmpvi.tiebreaker = ovi->tiebreaker; 770 } else 771 if ((ret = __rep_vote_info_unmarshal(env, 772 &tmpvi, rec->data, rec->size, NULL)) != 0) 773 return (ret); 774 vi = &tmpvi; 775 if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) { 776 RPRINT(env, DB_VERB_REP_ELECT, (env, 777 "Not in election gen %lu, at %lu, got vote", 778 (u_long)vi->egen, (u_long)rep->egen)); 779 ret = DB_REP_HOLDELECTION; 780 goto err; 781 } 782 783 /* 784 * Record this vote. In a VOTE2, the only valid entry 785 * in the vote information is the election generation. 786 * 787 * There are several things which can go wrong that we 788 * need to account for: 789 * 1. If we receive a latent VOTE2 from an earlier election, 790 * we want to ignore it. 791 * 2. If we receive a VOTE2 from a site from which we never 792 * received a VOTE1, we want to record it, because we simply 793 * may be processing messages out of order or its vote1 got lost, 794 * but that site got all the votes it needed to send it. 795 * 3. If we have received a duplicate VOTE2 from this election 796 * from the same site we want to ignore it. 797 * 4. If this is from the current election and someone is 798 * really voting for us, then we finally get to record it. 799 */ 800 /* 801 * Case 1. 802 */ 803 if (vi->egen != rep->egen) { 804 RPRINT(env, DB_VERB_REP_ELECT, 805 (env, "Bad vote egen %lu. Mine %lu", 806 (u_long)vi->egen, (u_long)rep->egen)); 807 ret = 0; 808 goto err; 809 } 810 811 /* 812 * __rep_tally takes care of cases 2, 3 and 4. 813 */ 814 if ((ret = __rep_tally(env, rep, eid, &rep->votes, 815 vi->egen, rep->v2tally_off)) != 0) { 816 ret = 0; 817 goto err; 818 } 819 RPRINT(env, DB_VERB_REP_ELECT, (env, "Counted vote %d of %d", 820 rep->votes, rep->nvotes)); 821 if (I_HAVE_WON(rep, rep->winner)) { 822 __rep_elect_master(env, rep); 823 ret = DB_REP_NEWMASTER; 824 } 825 826err: REP_SYSTEM_UNLOCK(env); 827 if (ret == DB_REP_NEWMASTER) 828 ret = __rep_fire_elected(env, rep, egen); 829 return (ret); 830} 831 832/* 833 * __rep_tally -- 834 * Handle incoming vote message on a client. Called with the db_rep 835 * mutex held. This function will return 0 if we successfully tally 836 * the vote and non-zero if the vote is ignored. This will record 837 * both VOTE1 and VOTE2 records, depending on which region offset the 838 * caller passed in. 839 */ 840static int 841__rep_tally(env, rep, eid, countp, egen, vtoff) 842 ENV *env; 843 REP *rep; 844 int eid; 845 u_int32_t *countp; 846 u_int32_t egen; 847 roff_t vtoff; 848{ 849 REP_VTALLY *tally, *vtp; 850 u_int32_t i; 851 852 tally = R_ADDR(env->reginfo, vtoff); 853 vtp = &tally[0]; 854 for (i = 0; i < *countp;) { 855 /* 856 * Ignore votes from earlier elections (i.e. we've heard 857 * from this site in this election, but its vote from an 858 * earlier election got delayed and we received it now). 859 * However, if we happened to hear from an earlier vote 860 * and we recorded it and we're now hearing from a later 861 * election we want to keep the updated one. Note that 862 * updating the entry will not increase the count. 863 * Also ignore votes that are duplicates. 864 */ 865 if (vtp->eid == eid) { 866 RPRINT(env, DB_VERB_REP_ELECT, (env, 867 "Tally found[%d] (%d, %lu), this vote (%d, %lu)", 868 i, vtp->eid, (u_long)vtp->egen, 869 eid, (u_long)egen)); 870 if (vtp->egen >= egen) 871 return (1); 872 else { 873 vtp->egen = egen; 874 return (0); 875 } 876 } 877 i++; 878 vtp = &tally[i]; 879 } 880 881 /* 882 * If we get here, we have a new voter we haven't seen before. Tally 883 * this vote. 884 */ 885 RPRINT(env, DB_VERB_REP_ELECT, (env, "Tallying VOTE%c[%d] (%d, %lu)", 886 vtoff == rep->tally_off ? '1' : '2', i, eid, (u_long)egen)); 887 888 vtp->eid = eid; 889 vtp->egen = egen; 890 (*countp)++; 891 return (0); 892} 893 894/* 895 * __rep_cmp_vote -- 896 * Compare incoming vote1 message on a client. Called with the db_rep 897 * mutex held. 898 * 899 */ 900static void 901__rep_cmp_vote(env, rep, eid, lsnp, priority, gen, tiebreaker, flags) 902 ENV *env; 903 REP *rep; 904 int eid; 905 DB_LSN *lsnp; 906 u_int32_t priority; 907 u_int32_t flags, gen, tiebreaker; 908{ 909 int cmp; 910 911 cmp = LOG_COMPARE(lsnp, &rep->w_lsn); 912 /* 913 * If we've seen more than one, compare us to the best so far. 914 * If we're the first, make ourselves the winner to start. 915 */ 916 if (rep->sites > 1 && 917 (priority != 0 || LF_ISSET(REPCTL_ELECTABLE))) { 918 /* 919 * Special case, if we have a mixed version group of sites, 920 * we set priority to 0, but set the ELECTABLE flag so that 921 * all sites talking at lower versions can correctly elect. 922 * If a non-zero priority comes in and current winner is 923 * zero priority (but was electable), then the non-zero 924 * site takes precedence no matter what its LSN is. 925 * 926 * Then LSN is determinant only if we're comparing 927 * like-styled version/priorities. I.e. both with 928 * 0/ELECTABLE priority or both with non-zero priority. 929 * Then actual priority value if LSNs 930 * are equal, then tiebreaker if both are equal. 931 */ 932 if ((priority != 0 && rep->w_priority == 0) || 933 (((priority == 0 && rep->w_priority == 0) || 934 (priority != 0 && rep->w_priority != 0)) && cmp > 0) || 935 (cmp == 0 && (priority > rep->w_priority || 936 (priority == rep->w_priority && 937 (tiebreaker > rep->w_tiebreaker))))) { 938 RPRINT(env, DB_VERB_REP_ELECT, 939 (env, "Accepting new vote")); 940 rep->winner = eid; 941 rep->w_priority = priority; 942 rep->w_lsn = *lsnp; 943 rep->w_gen = gen; 944 rep->w_tiebreaker = tiebreaker; 945 } 946 } else if (rep->sites == 1) { 947 if (priority != 0 || LF_ISSET(REPCTL_ELECTABLE)) { 948 /* Make ourselves the winner to start. */ 949 rep->winner = eid; 950 rep->w_priority = priority; 951 rep->w_gen = gen; 952 rep->w_lsn = *lsnp; 953 rep->w_tiebreaker = tiebreaker; 954 } else { 955 rep->winner = DB_EID_INVALID; 956 rep->w_priority = 0; 957 rep->w_gen = 0; 958 ZERO_LSN(rep->w_lsn); 959 rep->w_tiebreaker = 0; 960 } 961 } 962} 963 964/* 965 * __rep_elect_init 966 * Initialize an election. Sets beginp non-zero if the election is 967 * already in progress; makes it 0 otherwise. 968 */ 969static int 970__rep_elect_init(env, nsites, nvotes, beginp, otally) 971 ENV *env; 972 u_int32_t nsites, nvotes; 973 int *beginp; 974 u_int32_t *otally; 975{ 976 DB_LOG *dblp; 977 DB_LSN lsn; 978 DB_REP *db_rep; 979 LOG *lp; 980 REP *rep; 981 int ret; 982 983 db_rep = env->rep_handle; 984 rep = db_rep->region; 985 986 ret = 0; 987 988 /* We may miscount, as we don't hold the replication mutex here. */ 989 STAT(rep->stat.st_elections++); 990 991 /* If we are already master; simply broadcast that fact and return. */ 992 if (F_ISSET(rep, REP_F_MASTER)) { 993 dblp = env->lg_handle; 994 lp = dblp->reginfo.primary; 995 LOG_SYSTEM_LOCK(env); 996 lsn = lp->lsn; 997 LOG_SYSTEM_UNLOCK(env); 998 (void)__rep_send_message(env, 999 DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); 1000 if (IS_USING_LEASES(env)) 1001 ret = __rep_lease_refresh(env); 1002 STAT(rep->stat.st_elections_won++); 1003 return (DB_REP_NEWMASTER); 1004 } 1005 1006 REP_SYSTEM_LOCK(env); 1007 if (otally != NULL) 1008 *otally = F_ISSET(rep, REP_F_TALLY); 1009 *beginp = IN_ELECTION(rep) || F_ISSET(rep, REP_F_INREPELECT); 1010 if (!*beginp) { 1011 /* 1012 * Make sure that we always initialize all the election fields 1013 * before putting ourselves in an election state. That means 1014 * issuing calls that can fail (allocation) before setting all 1015 * the variables. 1016 */ 1017 if (nsites > rep->asites && 1018 (ret = __rep_grow_sites(env, nsites)) != 0) 1019 goto err; 1020 DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTINIT, ret, NULL); 1021 F_SET(rep, REP_F_INREPELECT); 1022 F_CLR(rep, REP_F_EGENUPDATE); 1023 /* 1024 * If we're the first to the party, we simply set initial 1025 * values: pre-existing values would be left over from previous 1026 * election. 1027 */ 1028 if (!IN_ELECTION_TALLY(rep)) { 1029 rep->nsites = nsites; 1030 rep->nvotes = nvotes; 1031 } else { 1032 if (nsites > rep->nsites) 1033 rep->nsites = nsites; 1034 if (nvotes > rep->nvotes) 1035 rep->nvotes = nvotes; 1036 } 1037 } 1038DB_TEST_RECOVERY_LABEL 1039err: REP_SYSTEM_UNLOCK(env); 1040 return (ret); 1041} 1042 1043/* 1044 * __rep_elect_master 1045 * Set up for new master from election. Must be called with 1046 * the replication region mutex held. 1047 */ 1048static void 1049__rep_elect_master(env, rep) 1050 ENV *env; 1051 REP *rep; 1052{ 1053 /* 1054 * We often come through here twice, sometimes even more. We mustn't 1055 * let the redundant calls affect stats counting. But rep_elect relies 1056 * on this first part for setting eidp. 1057 */ 1058 rep->master_id = rep->eid; 1059 1060 if (F_ISSET(rep, REP_F_MASTERELECT | REP_F_MASTER)) { 1061 /* We've been through here already; avoid double counting. */ 1062 return; 1063 } 1064 1065 F_SET(rep, REP_F_MASTERELECT); 1066 STAT(rep->stat.st_elections_won++); 1067 1068 RPRINT(env, DB_VERB_REP_ELECT, (env, 1069 "Got enough votes to win; election done; winner is %d, gen %lu", 1070 rep->master_id, (u_long)rep->gen)); 1071} 1072 1073static int 1074__rep_fire_elected(env, rep, egen) 1075 ENV *env; 1076 REP *rep; 1077 u_int32_t egen; 1078{ 1079 REP_EVENT_LOCK(env); 1080 if (rep->notified_egen < egen) { 1081 __rep_fire_event(env, DB_EVENT_REP_ELECTED, NULL); 1082 rep->notified_egen = egen; 1083 } 1084 REP_EVENT_UNLOCK(env); 1085 return (0); 1086} 1087 1088/* 1089 * Compute a sleep interval. Set it to the smaller of .5s or 1090 * timeout/10, making sure we sleep at least 1usec if timeout < 10. 1091 */ 1092#define SLEEPTIME(timeout) \ 1093 (timeout > 5000000) ? 500000 : ((timeout >= 10) ? timeout / 10 : 1); 1094 1095static int 1096__rep_wait(env, timeoutp, eidp, full_elect, flags) 1097 ENV *env; 1098 db_timeout_t *timeoutp; 1099 int *eidp, full_elect; 1100 u_int32_t flags; 1101{ 1102 DB_REP *db_rep; 1103 REP *rep; 1104 int done, echg, phase_over, ret; 1105 u_int32_t egen, sleeptime, sleeptotal, timeout; 1106 1107 db_rep = env->rep_handle; 1108 rep = db_rep->region; 1109 egen = rep->egen; 1110 done = echg = phase_over = ret = 0; 1111 1112 timeout = *timeoutp; 1113 /* 1114 * The user specifies an overall timeout function, but checking 1115 * is cheap and the timeout may be a generous upper bound. 1116 * Sleep repeatedly for the smaller of .5s and timeout/10. 1117 */ 1118 sleeptime = SLEEPTIME(timeout); 1119 sleeptotal = 0; 1120 while (sleeptotal < timeout) { 1121 __os_yield(env, 0, sleeptime); 1122 sleeptotal += sleeptime; 1123 REP_SYSTEM_LOCK(env); 1124 /* 1125 * Check if group membership changed while we were 1126 * sleeping. Specifically we're trying for a full 1127 * election and someone is telling us we're joining 1128 * a previously established replication group. 1129 */ 1130 if (full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) { 1131 *timeoutp = rep->elect_timeout; 1132 timeout = *timeoutp; 1133 /* 1134 * We adjusted timeout, if we've already waited 1135 * that long, then return as though this phase 1136 * timed out. However, we want to give other 1137 * changes a chance to return, so if we both 1138 * found a group and found a new egen, we 1139 * override this return with the egen information. 1140 * If we found a group and our election finished 1141 * then we want to return the election completion. 1142 */ 1143 if (sleeptotal >= timeout) { 1144 done = 1; 1145 ret = DB_TIMEOUT; 1146 } else 1147 sleeptime = SLEEPTIME(timeout); 1148 } 1149 1150 echg = egen != rep->egen; 1151 phase_over = !F_ISSET(rep, flags); 1152 1153 /* 1154 * Since we're not clearing out master_id any more, 1155 * we need to do more to detect the difference between 1156 * a new master getting elected and egen changing, 1157 * or a new election starting because the old one 1158 * timed out at another site (which easily happens 1159 * when sites have very different timeout settings). 1160 * 1161 * Detect this by: 1162 * If my phase was over, egen has changed but 1163 * there are still election flags set, or we're 1164 * told our egen was out of date and updated 1165 * then return DB_REP_EGENCHG. 1166 * 1167 * Otherwise, if my phase is over I want to 1168 * set my idea of the master and return. 1169 */ 1170 if (phase_over && echg && 1171 (IN_ELECTION_TALLY(rep) || 1172 F_ISSET(rep, REP_F_EGENUPDATE))) { 1173 done = 1; 1174 F_CLR(rep, REP_F_EGENUPDATE); 1175 ret = DB_REP_EGENCHG; 1176 } else if (phase_over) { 1177 *eidp = rep->master_id; 1178 done = 1; 1179 ret = 0; 1180 } 1181 REP_SYSTEM_UNLOCK(env); 1182 1183 if (done) 1184 return (ret); 1185 } 1186 return (DB_TIMEOUT); 1187} 1188