1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2004,2008 Oracle. All rights reserved. 5 * 6 * $Id: rep_verify.c,v 12.69 2008/03/13 16:21:05 mbrey Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/db_am.h" 14#include "dbinc/log.h" 15#include "dbinc/txn.h" 16 17static int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *)); 18 19/* 20 * __rep_verify -- 21 * Handle a REP_VERIFY message. 22 * 23 * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *, 24 * PUBLIC: int, time_t)); 25 */ 26int 27__rep_verify(env, rp, rec, eid, savetime) 28 ENV *env; 29 __rep_control_args *rp; 30 DBT *rec; 31 int eid; 32 time_t savetime; 33{ 34 DBT mylog; 35 DB_LOG *dblp; 36 DB_LOGC *logc; 37 DB_LSN lsn; 38 DB_REP *db_rep; 39 LOG *lp; 40 REP *rep; 41 u_int32_t rectype, logflag; 42 int match, ret, t_ret; 43 44 ret = 0; 45 db_rep = env->rep_handle; 46 rep = db_rep->region; 47 dblp = env->lg_handle; 48 lp = dblp->reginfo.primary; 49 50 /* Do nothing if VERIFY flag is not set. */ 51 if (!F_ISSET(rep, REP_F_RECOVER_VERIFY)) 52 return (ret); 53 54#ifdef DIAGNOSTIC 55 /* 56 * We should not ever be in internal init with a lease granted. 57 */ 58 if (IS_USING_LEASES(env)) { 59 REP_SYSTEM_LOCK(env); 60 DB_ASSERT(env, __rep_islease_granted(env) == 0); 61 REP_SYSTEM_UNLOCK(env); 62 } 63#endif 64 65 if ((ret = __log_cursor(env, &logc)) != 0) 66 return (ret); 67 memset(&mylog, 0, sizeof(mylog)); 68 /* If verify_lsn of ZERO is passed in, get last log. */ 69 MUTEX_LOCK(env, rep->mtx_clientdb); 70 logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET; 71 MUTEX_UNLOCK(env, rep->mtx_clientdb); 72 if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0) 73 goto err; 74 match = 0; 75 LOGCOPY_32(env, &rectype, mylog.data); 76 if (mylog.size == rec->size && 77 memcmp(mylog.data, rec->data, rec->size) == 0) 78 match = 1; 79 /* 80 * If we don't have a match, backup to the previous 81 * identification record and try again. 82 */ 83 if (match == 0) { 84 ZERO_LSN(lsn); 85 if ((ret = __rep_log_backup(env, rep, logc, &lsn)) == 0) { 86 MUTEX_LOCK(env, rep->mtx_clientdb); 87 lp->verify_lsn = lsn; 88 __os_gettime(env, &lp->rcvd_ts, 1); 89 lp->wait_ts = rep->request_gap; 90 MUTEX_UNLOCK(env, rep->mtx_clientdb); 91 (void)__rep_send_message(env, eid, REP_VERIFY_REQ, 92 &lsn, NULL, 0, DB_REP_ANYWHERE); 93 } else if (ret == DB_NOTFOUND) { 94 /* 95 * We've either run out of records because 96 * logs have been removed or we've rolled back 97 * all the way to the beginning. 98 */ 99 STAT(rep->stat.st_outdated++); 100 REP_SYSTEM_LOCK(env); 101 if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) 102 ret = DB_REP_JOIN_FAILURE; 103 else { 104 F_CLR(rep, REP_F_RECOVER_VERIFY); 105 F_SET(rep, REP_F_RECOVER_UPDATE); 106 ZERO_LSN(rep->first_lsn); 107 ZERO_LSN(rep->ckp_lsn); 108 ret = 0; 109 } 110 REP_SYSTEM_UNLOCK(env); 111 if (ret == 0) 112 (void)__rep_send_message(env, 113 eid, REP_UPDATE_REQ, NULL, 114 NULL, 0, 0); 115 } 116 } else 117 ret = __rep_verify_match(env, &rp->lsn, savetime); 118 119err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) 120 ret = t_ret; 121 return (ret); 122} 123 124/* 125 * __rep_verify_fail -- 126 * Handle a REP_VERIFY_FAIL message. 127 * 128 * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *, int)); 129 */ 130int 131__rep_verify_fail(env, rp, eid) 132 ENV *env; 133 __rep_control_args *rp; 134 int eid; 135{ 136 DB_LOG *dblp; 137 DB_REP *db_rep; 138 LOG *lp; 139 REP *rep; 140 int lockout, ret; 141 142 lockout = 0; 143 ret = 0; 144 db_rep = env->rep_handle; 145 rep = db_rep->region; 146 dblp = env->lg_handle; 147 lp = dblp->reginfo.primary; 148 149 /* 150 * If any recovery flags are set, but not LOG or VERIFY, 151 * then we ignore this message. We are already 152 * in the middle of updating. 153 */ 154 if (F_ISSET(rep, REP_F_RECOVER_MASK) && 155 !F_ISSET(rep, REP_F_RECOVER_LOG | REP_F_RECOVER_VERIFY)) 156 return (0); 157 MUTEX_LOCK(env, rep->mtx_clientdb); 158 REP_SYSTEM_LOCK(env); 159 /* 160 * We should not ever be in internal init with a lease granted. 161 */ 162 DB_ASSERT(env, 163 !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); 164 165 /* 166 * Update stats. 167 */ 168 STAT(rep->stat.st_outdated++); 169 170 /* 171 * Clean up old internal init in progress if: 172 * REP_C_NOAUTOINIT is not configured and 173 * we are recovering LOG and this LSN is in the range we need. 174 */ 175 if (!FLD_ISSET(rep->config, REP_C_NOAUTOINIT) && 176 (F_ISSET(rep, REP_F_RECOVER_LOG) && 177 LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 && 178 LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0)) { 179 /* 180 * Already locking out messages, give up. 181 */ 182 if (F_ISSET(rep, REP_F_READY_MSG)) 183 goto unlock; 184 185 /* 186 * Lock out other messages to prevent race conditions. 187 */ 188 if ((ret = __rep_lockout_msg(env, rep, 1)) != 0) 189 goto unlock; 190 lockout = 1; 191 192 /* 193 * Clean up internal init if one was in progress. 194 */ 195 if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) { 196 RPRINT(env, DB_VERB_REP_SYNC, (env, 197 "VERIFY_FAIL is cleaning up old internal init for missing log")); 198 if ((ret = 199 __rep_init_cleanup(env, rep, DB_FORCE)) != 0) { 200 RPRINT(env, DB_VERB_REP_SYNC, (env, 201 "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret)); 202 goto msglck; 203 } 204 F_CLR(rep, REP_F_RECOVER_MASK); 205 } 206 F_CLR(rep, REP_F_READY_MSG); 207 lockout = 0; 208 } 209 210 /* 211 * Commence an internal init if: 212 * We are in VERIFY state and the failing LSN is the one we 213 * were verifying or 214 * we're recovering LOG and this LSN is in the range we need or 215 * we are in normal state (no recovery flags set) and 216 * the failing LSN is the one we're ready for. 217 */ 218 if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) && 219 LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) || 220 (F_ISSET(rep, REP_F_RECOVER_LOG) && 221 LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 && 222 LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) || 223 (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 && 224 LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) { 225 /* 226 * We don't want an old or delayed VERIFY_FAIL 227 * message to throw us into internal initialization 228 * when we shouldn't be. If REP_C_NOAUTOINIT is configured, 229 * return DB_REP_JOIN_FAILURE instead of doing internal init. 230 */ 231 if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) { 232 ret = DB_REP_JOIN_FAILURE; 233 goto unlock; 234 } 235 236 /* 237 * Do the internal init. 238 */ 239 F_CLR(rep, REP_F_RECOVER_VERIFY); 240 F_SET(rep, REP_F_RECOVER_UPDATE); 241 ZERO_LSN(rep->first_lsn); 242 ZERO_LSN(rep->ckp_lsn); 243 lp->wait_ts = rep->request_gap; 244 REP_SYSTEM_UNLOCK(env); 245 MUTEX_UNLOCK(env, rep->mtx_clientdb); 246 (void)__rep_send_message(env, 247 eid, REP_UPDATE_REQ, NULL, NULL, 0, 0); 248 } else { 249 /* 250 * Otherwise ignore this message. 251 */ 252msglck: if (lockout) 253 F_CLR(rep, REP_F_READY_MSG); 254unlock: REP_SYSTEM_UNLOCK(env); 255 MUTEX_UNLOCK(env, rep->mtx_clientdb); 256 } 257 return (ret); 258} 259 260/* 261 * __rep_verify_req -- 262 * Handle a REP_VERIFY_REQ message. 263 * 264 * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int)); 265 */ 266int 267__rep_verify_req(env, rp, eid) 268 ENV *env; 269 __rep_control_args *rp; 270 int eid; 271{ 272 DBT *d, data_dbt; 273 DB_LOGC *logc; 274 DB_REP *db_rep; 275 REP *rep; 276 u_int32_t type; 277 int old, ret; 278 279 ret = 0; 280 db_rep = env->rep_handle; 281 rep = db_rep->region; 282 283 type = REP_VERIFY; 284 if ((ret = __log_cursor(env, &logc)) != 0) 285 return (ret); 286 d = &data_dbt; 287 memset(d, 0, sizeof(data_dbt)); 288 F_SET(logc, DB_LOG_SILENT_ERR); 289 ret = __logc_get(logc, &rp->lsn, d, DB_SET); 290 /* 291 * If the LSN was invalid, then we might get a DB_NOTFOUND 292 * we might get an EIO, we could get anything. 293 * If we get a DB_NOTFOUND, then there is a chance that 294 * the LSN comes before the first file present in which 295 * case we need to return a fail so that the client can 296 * perform an internal init or return a REP_JOIN_FAILURE. 297 * 298 * If we're a client servicing this request and we get a 299 * NOTFOUND, return it so the caller can rerequest from 300 * a better source. 301 */ 302 if (ret == DB_NOTFOUND) { 303 if (F_ISSET(rep, REP_F_CLIENT)) { 304 (void)__logc_close(logc); 305 return (DB_NOTFOUND); 306 } 307 if (__log_is_outdated(env, rp->lsn.file, &old) == 0 && 308 old != 0) 309 type = REP_VERIFY_FAIL; 310 } 311 312 if (ret != 0) 313 d = NULL; 314 315 (void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0); 316 return (__logc_close(logc)); 317} 318 319static int 320__rep_dorecovery(env, lsnp, trunclsnp) 321 ENV *env; 322 DB_LSN *lsnp, *trunclsnp; 323{ 324 DBT mylog; 325 DB_LOGC *logc; 326 DB_LSN last_ckp, lsn; 327 DB_REP *db_rep; 328 DB_THREAD_INFO *ip; 329 REP *rep; 330 int ret, skip_rec, t_ret, update; 331 u_int32_t rectype, opcode; 332 __txn_regop_args *txnrec; 333 __txn_regop_42_args *txn42rec; 334 335 db_rep = env->rep_handle; 336 rep = db_rep->region; 337 ENV_GET_THREAD_INFO(env, ip); 338 339 /* Figure out if we are backing out any committed transactions. */ 340 if ((ret = __log_cursor(env, &logc)) != 0) 341 return (ret); 342 343 memset(&mylog, 0, sizeof(mylog)); 344 if (F_ISSET(rep, REP_F_RECOVER_LOG)) { 345 /* 346 * Internal init can never skip recovery. 347 * Internal init must always update the timestamp and 348 * force dead handles. 349 */ 350 skip_rec = 0; 351 update = 1; 352 } else { 353 skip_rec = 1; 354 update = 0; 355 } 356 while (update == 0 && 357 (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 && 358 LOG_COMPARE(&lsn, lsnp) > 0) { 359 LOGCOPY_32(env, &rectype, mylog.data); 360 /* 361 * Find out if we can skip recovery completely. If we 362 * are backing up over any record a client usually 363 * cares about, we must run recovery. 364 * 365 * Skipping sync-up recovery can be pretty scary! 366 * Here's why we can do it: 367 * If a master downgraded to client and is now running 368 * sync-up to a new master, that old master must have 369 * waited for any outstanding txns to resolve before 370 * becoming a client. Also we are in lockout so there 371 * can be no other operations right now. 372 * 373 * If the client wrote a commit record to the log, but 374 * was descheduled before processing the txn, and then 375 * a new master was found, we must've let the txn get 376 * processed because right now we are the only message 377 * thread allowed to be running. 378 */ 379 DB_ASSERT(env, rep->op_cnt == 0); 380 DB_ASSERT(env, rep->msg_th == 1); 381 if (rectype == DB___txn_regop || rectype == DB___txn_ckp || 382 rectype == DB___dbreg_register) 383 skip_rec = 0; 384 if (rectype == DB___txn_regop) { 385 if (rep->version >= DB_REPVERSION_44) { 386 if ((ret = __txn_regop_read( 387 env, mylog.data, &txnrec)) != 0) 388 goto err; 389 opcode = txnrec->opcode; 390 __os_free(env, txnrec); 391 } else { 392 if ((ret = __txn_regop_42_read( 393 env, mylog.data, &txn42rec)) != 0) 394 goto err; 395 opcode = txn42rec->opcode; 396 __os_free(env, txn42rec); 397 } 398 if (opcode != TXN_ABORT) 399 update = 1; 400 } 401 } 402 /* 403 * Handle if the logc_get fails. 404 */ 405 if (ret != 0) 406 goto err; 407 408 /* 409 * If we successfully run recovery, we've opened all the necessary 410 * files. We are guaranteed to be single-threaded here, so no mutex 411 * is necessary. 412 */ 413 if (skip_rec) { 414 if ((ret = __log_get_stable_lsn(env, &last_ckp)) != 0) { 415 if (ret != DB_NOTFOUND) 416 goto err; 417 ZERO_LSN(last_ckp); 418 } 419 RPRINT(env, DB_VERB_REP_SYNC, (env, 420 "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]", 421 (u_long)lsnp->file, (u_long)lsnp->offset, 422 (u_long)last_ckp.file, (u_long)last_ckp.offset)); 423 ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp); 424 } else 425 ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0); 426 427 if (ret != 0) 428 goto err; 429 F_SET(db_rep, DBREP_OPENFILES); 430 431err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) 432 ret = t_ret; 433 434 return (ret); 435} 436 437/* 438 * __rep_verify_match -- 439 * We have just received a matching log record during verification. 440 * Figure out if we're going to need to run recovery. If so, wait until 441 * everything else has exited the library. If not, set up the world 442 * correctly and move forward. 443 * 444 * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t)); 445 */ 446int 447__rep_verify_match(env, reclsnp, savetime) 448 ENV *env; 449 DB_LSN *reclsnp; 450 time_t savetime; 451{ 452 DB_LOG *dblp; 453 DB_LSN trunclsn; 454 DB_REP *db_rep; 455 DB_THREAD_INFO *ip; 456 LOG *lp; 457 REGENV *renv; 458 REGINFO *infop; 459 REP *rep; 460 int done, master, ret; 461 u_int32_t unused; 462 463 dblp = env->lg_handle; 464 db_rep = env->rep_handle; 465 rep = db_rep->region; 466 lp = dblp->reginfo.primary; 467 ret = 0; 468 infop = env->reginfo; 469 renv = infop->primary; 470 ENV_GET_THREAD_INFO(env, ip); 471 472 /* 473 * Check if the savetime is different than our current time stamp. 474 * If it is, then we're racing with another thread trying to recover 475 * and we lost. We must give up. 476 */ 477 MUTEX_LOCK(env, rep->mtx_clientdb); 478 done = savetime != renv->rep_timestamp; 479 if (done) { 480 MUTEX_UNLOCK(env, rep->mtx_clientdb); 481 return (0); 482 } 483 ZERO_LSN(lp->verify_lsn); 484 MUTEX_UNLOCK(env, rep->mtx_clientdb); 485 486 /* 487 * Make sure the world hasn't changed while we tried to get 488 * the lock. If it hasn't then it's time for us to kick all 489 * operations out of DB and run recovery. 490 */ 491 REP_SYSTEM_LOCK(env); 492 if (F_ISSET(rep, REP_F_READY_MSG) || 493 (!F_ISSET(rep, REP_F_RECOVER_LOG) && 494 F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP))) { 495 /* 496 * We lost. The world changed and we should do nothing. 497 */ 498 STAT(rep->stat.st_msgs_recover++); 499 goto errunlock; 500 } 501 502 /* 503 * Lockout all message threads but ourselves. 504 */ 505 if ((ret = __rep_lockout_msg(env, rep, 1)) != 0) 506 goto errunlock; 507 508 /* 509 * Lockout the API and wait for operations to complete. 510 */ 511 if ((ret = __rep_lockout_api(env, rep)) != 0) 512 goto errunlock; 513 514 /* OK, everyone is out, we can now run recovery. */ 515 REP_SYSTEM_UNLOCK(env); 516 517 if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 || 518 (ret = __rep_remove_init_file(env)) != 0) { 519 REP_SYSTEM_LOCK(env); 520 F_CLR(rep, REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP); 521 goto errunlock; 522 } 523 524 /* 525 * The log has been truncated (either directly by us or by __db_apprec) 526 * We want to make sure we're waiting for the LSN at the new end-of-log, 527 * not some later point. 528 */ 529 MUTEX_LOCK(env, rep->mtx_clientdb); 530 lp->ready_lsn = trunclsn; 531 ZERO_LSN(lp->waiting_lsn); 532 ZERO_LSN(lp->max_wait_lsn); 533 lp->max_perm_lsn = *reclsnp; 534 lp->wait_ts = rep->request_gap; 535 __os_gettime(env, &lp->rcvd_ts, 1); 536 ZERO_LSN(lp->verify_lsn); 537 538 /* 539 * Discard any log records we have queued; we're about to re-request 540 * them, and can't trust the ones in the queue. We need to set the 541 * DB_AM_RECOVER bit in this handle, so that the operation doesn't 542 * deadlock. 543 */ 544 if (db_rep->rep_db == NULL && 545 (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { 546 MUTEX_UNLOCK(env, rep->mtx_clientdb); 547 goto out; 548 } 549 550 F_SET(db_rep->rep_db, DB_AM_RECOVER); 551 MUTEX_UNLOCK(env, rep->mtx_clientdb); 552 ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused); 553 MUTEX_LOCK(env, rep->mtx_clientdb); 554 F_CLR(db_rep->rep_db, DB_AM_RECOVER); 555 556 REP_SYSTEM_LOCK(env); 557 rep->stat.st_log_queued = 0; 558 F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK | REP_F_READY_MSG); 559 if (ret != 0) 560 goto errunlock2; 561 562 /* 563 * If the master_id is invalid, this means that since 564 * the last record was sent, something happened to the 565 * master and we may not have a master to request 566 * things of. 567 * 568 * This is not an error; when we find a new master, 569 * we'll re-negotiate where the end of the log is and 570 * try to bring ourselves up to date again anyway. 571 */ 572 master = rep->master_id; 573 REP_SYSTEM_UNLOCK(env); 574 if (master == DB_EID_INVALID) { 575 MUTEX_UNLOCK(env, rep->mtx_clientdb); 576 ret = 0; 577 } else { 578 /* 579 * We're making an ALL_REQ. But now that we've 580 * cleared the flags, we're likely receiving new 581 * log records from the master, resulting in a gap 582 * immediately. So to avoid multiple data streams, 583 * set the wait_ts value high now to give the master 584 * a chance to start sending us these records before 585 * the gap code re-requests the same gap. Wait_recs 586 * will get reset once we start receiving these 587 * records. 588 */ 589 lp->wait_ts = rep->max_gap; 590 MUTEX_UNLOCK(env, rep->mtx_clientdb); 591 (void)__rep_send_message(env, 592 master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE); 593 } 594 if (0) { 595errunlock2: MUTEX_UNLOCK(env, rep->mtx_clientdb); 596errunlock: REP_SYSTEM_UNLOCK(env); 597 } 598out: return (ret); 599} 600 601/* 602 * __rep_log_backup -- 603 * 604 * In the verify handshake, we walk backward looking for 605 * identification records. Those are the only record types 606 * we verify and match on. 607 * 608 * PUBLIC: int __rep_log_backup __P((ENV *, REP *, DB_LOGC *, DB_LSN *)); 609 */ 610int 611__rep_log_backup(env, rep, logc, lsn) 612 ENV *env; 613 REP *rep; 614 DB_LOGC *logc; 615 DB_LSN *lsn; 616{ 617 DBT mylog; 618 u_int32_t rectype; 619 int ret; 620 621 ret = 0; 622 memset(&mylog, 0, sizeof(mylog)); 623 while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) { 624 /* 625 * Determine what we look for based on version number. 626 * Due to the contents of records changing between 627 * versions we have to match based on criteria of that 628 * particular version. 629 */ 630 LOGCOPY_32(env, &rectype, mylog.data); 631 /* 632 * In 4.4 and beyond we match checkpoint and commit. 633 */ 634 if (rep->version >= DB_REPVERSION_44 && 635 (rectype == DB___txn_ckp || rectype == DB___txn_regop)) 636 break; 637 } 638 return (ret); 639} 640