1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1996 8 * The President and Fellows of Harvard University. All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $Id: txn_rec.c,v 12.29 2008/03/13 20:48:48 mbrey Exp $ 35 */ 36 37#include "db_config.h" 38 39#include "db_int.h" 40#include "dbinc/db_page.h" 41#include "dbinc/lock.h" 42#include "dbinc/txn.h" 43#include "dbinc/db_am.h" 44 45/* 46 * PUBLIC: int __txn_regop_recover 47 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 48 * 49 * These records are only ever written for commits. Normally, we redo any 50 * committed transaction, however if we are doing recovery to a timestamp, then 51 * we may treat transactions that committed after the timestamp as aborted. 52 */ 53int 54__txn_regop_recover(env, dbtp, lsnp, op, info) 55 ENV *env; 56 DBT *dbtp; 57 DB_LSN *lsnp; 58 db_recops op; 59 void *info; 60{ 61 __txn_regop_args *argp; 62 DB_TXNHEAD *headp; 63 int ret; 64 u_int32_t status; 65 66#ifdef DEBUG_RECOVER 67 (void)__txn_regop_print(env, dbtp, lsnp, op, info); 68#endif 69 70 if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0) 71 return (ret); 72 73 headp = info; 74 /* 75 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL. 76 * We check for the former explicitly and the last two clauses 77 * apply to the BACKWARD_ROLL case. 78 */ 79 80 if (op == DB_TXN_FORWARD_ROLL) { 81 /* 82 * If this was a 2-phase-commit transaction, then it 83 * might already have been removed from the list, and 84 * that's OK. Ignore the return code from remove. 85 */ 86 if ((ret = __db_txnlist_remove(env, 87 info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0) 88 goto err; 89 } else if ((env->dbenv->tx_timestamp != 0 && 90 argp->timestamp > (int32_t)env->dbenv->tx_timestamp) || 91 (!IS_ZERO_LSN(headp->trunc_lsn) && 92 LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) { 93 /* 94 * We failed either the timestamp check or the trunc_lsn check, 95 * so we treat this as an abort even if it was a commit record. 96 */ 97 if ((ret = __db_txnlist_update(env, info, 98 argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0) 99 goto err; 100 else if (status != TXN_IGNORE && status != TXN_OK) 101 goto err; 102 } else { 103 /* This is a normal commit; mark it appropriately. */ 104 if ((ret = __db_txnlist_update(env, 105 info, argp->txnp->txnid, argp->opcode, lsnp, 106 &status, 0)) == DB_NOTFOUND) { 107 if ((ret = __db_txnlist_add(env, 108 info, argp->txnp->txnid, 109 argp->opcode == TXN_ABORT ? 110 TXN_IGNORE : argp->opcode, lsnp)) != 0) 111 goto err; 112 } else if (ret != 0 || 113 (status != TXN_IGNORE && status != TXN_OK)) 114 goto err; 115 } 116 117 if (ret == 0) 118 *lsnp = argp->prev_lsn; 119 120 if (0) { 121err: __db_errx(env, 122 "txnid %lx commit record found, already on commit list", 123 (u_long)argp->txnp->txnid); 124 ret = EINVAL; 125 } 126 __os_free(env, argp); 127 128 return (ret); 129} 130 131/* 132 * PUBLIC: int __txn_xa_regop_recover 133 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 134 * 135 * These records are only ever written for prepares. 136 */ 137int 138__txn_xa_regop_recover(env, dbtp, lsnp, op, info) 139 ENV *env; 140 DBT *dbtp; 141 DB_LSN *lsnp; 142 db_recops op; 143 void *info; 144{ 145 __txn_xa_regop_args *argp; 146 DBT *lock_dbt; 147 DB_TXNHEAD *headp; 148 DB_LOCKTAB *lt; 149 u_int32_t status; 150 int ret; 151 152#ifdef DEBUG_RECOVER 153 (void)__txn_xa_regop_print(env, dbtp, lsnp, op, info); 154#endif 155 156 if ((ret = __txn_xa_regop_read(env, dbtp->data, &argp)) != 0) 157 return (ret); 158 159 if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) { 160 ret = EINVAL; 161 goto err; 162 } 163 headp = info; 164 165 /* 166 * The return value here is either a DB_NOTFOUND or it is 167 * the transaction status from the list. It is not a normal 168 * error return, so we must make sure that in each of the 169 * cases below, we overwrite the ret value so we return 170 * appropriately. 171 */ 172 ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status); 173 174 /* 175 * If we are rolling forward, then an aborted prepare 176 * indicates that this may be the last record we'll see for 177 * this transaction ID, so we should remove it from the list. 178 */ 179 180 if (op == DB_TXN_FORWARD_ROLL) { 181 if ((ret = __db_txnlist_remove(env, 182 info, argp->txnp->txnid)) != 0) 183 goto txn_err; 184 } else if (op == DB_TXN_BACKWARD_ROLL && status == TXN_PREPARE) { 185 /* 186 * On the backward pass, we have four possibilities: 187 * 1. The transaction is already committed, no-op. 188 * 2. The transaction is already aborted, no-op. 189 * 3. The prepare failed and was aborted, mark as abort. 190 * 4. The transaction is neither committed nor aborted. 191 * Treat this like a commit and roll forward so that 192 * the transaction can be resurrected in the region. 193 * We handle cases 3 and 4 here; cases 1 and 2 194 * are the final clause below. 195 */ 196 if (argp->opcode == TXN_ABORT) { 197 if ((ret = __db_txnlist_update(env, 198 info, argp->txnp->txnid, 199 TXN_ABORT, NULL, &status, 0)) != 0 && 200 status != TXN_PREPARE) 201 goto txn_err; 202 ret = 0; 203 } 204 /* 205 * This is prepared, but not yet committed transaction. We 206 * need to add it to the transaction list, so that it gets 207 * rolled forward. We also have to add it to the region's 208 * internal state so it can be properly aborted or committed 209 * after recovery (see txn_recover). 210 */ 211 else if ((ret = __db_txnlist_remove(env, 212 info, argp->txnp->txnid)) != 0) { 213txn_err: __db_errx(env, 214 "transaction not in list %lx", 215 (u_long)argp->txnp->txnid); 216 ret = DB_NOTFOUND; 217 } else if (IS_ZERO_LSN(headp->trunc_lsn) || 218 LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) { 219 if ((ret = __db_txnlist_add(env, 220 info, argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) { 221 /* Re-acquire the locks for this transaction. */ 222 lock_dbt = &argp->locks; 223 if (LOCKING_ON(env)) { 224 lt = env->lk_handle; 225 if ((ret = __lock_getlocker(lt, 226 argp->txnp->txnid, 1, 227 &argp->txnp->locker)) != 0) 228 goto err; 229 if ((ret = __lock_get_list(env, 230 argp->txnp->locker, 0, 231 DB_LOCK_WRITE, lock_dbt)) != 0) 232 goto err; 233 } 234 235 ret = __txn_restore_txn(env, lsnp, argp); 236 } 237 } 238 } else 239 ret = 0; 240 241 if (ret == 0) 242 *lsnp = argp->prev_lsn; 243 244err: __os_free(env, argp); 245 246 return (ret); 247} 248 249/* 250 * PUBLIC: int __txn_ckp_recover 251 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 252 */ 253int 254__txn_ckp_recover(env, dbtp, lsnp, op, info) 255 ENV *env; 256 DBT *dbtp; 257 DB_LSN *lsnp; 258 db_recops op; 259 void *info; 260{ 261 __txn_ckp_args *argp; 262 int ret; 263 264#ifdef DEBUG_RECOVER 265 __txn_ckp_print(env, dbtp, lsnp, op, info); 266#endif 267 if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0) 268 return (ret); 269 270 if (op == DB_TXN_BACKWARD_ROLL) 271 __db_txnlist_ckp(env, info, lsnp); 272 273 *lsnp = argp->last_ckp; 274 __os_free(env, argp); 275 return (DB_TXN_CKP); 276} 277 278/* 279 * __txn_child_recover 280 * Recover a commit record for a child transaction. 281 * 282 * PUBLIC: int __txn_child_recover 283 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 284 */ 285int 286__txn_child_recover(env, dbtp, lsnp, op, info) 287 ENV *env; 288 DBT *dbtp; 289 DB_LSN *lsnp; 290 db_recops op; 291 void *info; 292{ 293 __txn_child_args *argp; 294 u_int32_t c_stat, p_stat, tmpstat; 295 int ret, t_ret; 296 297#ifdef DEBUG_RECOVER 298 (void)__txn_child_print(env, dbtp, lsnp, op, info); 299#endif 300 if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0) 301 return (ret); 302 303 /* 304 * This is a record in a PARENT's log trail indicating that a 305 * child committed. If we are aborting, return the childs last 306 * record's LSN. If we are in recovery, then if the 307 * parent is committing, we set ourselves up to commit, else 308 * we do nothing. 309 */ 310 if (op == DB_TXN_ABORT) { 311 *lsnp = argp->c_lsn; 312 ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn); 313 goto out; 314 } else if (op == DB_TXN_BACKWARD_ROLL) { 315 /* Child might exist -- look for it. */ 316 ret = __db_txnlist_find(env, info, argp->child, &c_stat); 317 t_ret = 318 __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat); 319 if (ret != 0 && ret != DB_NOTFOUND) 320 goto out; 321 if (t_ret != 0 && t_ret != DB_NOTFOUND) { 322 ret = t_ret; 323 goto out; 324 } 325 /* 326 * If the parent is in state COMMIT or IGNORE, then we apply 327 * that to the child, else we need to abort the child. 328 */ 329 330 if (ret == DB_NOTFOUND || 331 c_stat == TXN_OK || c_stat == TXN_COMMIT) { 332 if (t_ret == DB_NOTFOUND || 333 (p_stat != TXN_COMMIT && p_stat != TXN_IGNORE)) 334 c_stat = TXN_ABORT; 335 else 336 c_stat = p_stat; 337 338 if (ret == DB_NOTFOUND) 339 ret = __db_txnlist_add(env, 340 info, argp->child, c_stat, NULL); 341 else 342 ret = __db_txnlist_update(env, info, 343 argp->child, c_stat, NULL, &tmpstat, 0); 344 } else if (c_stat == TXN_EXPECTED) { 345 /* 346 * The open after this create succeeded. If the 347 * parent succeeded, we don't want to redo; if the 348 * parent aborted, we do want to undo. 349 */ 350 switch (p_stat) { 351 case TXN_COMMIT: 352 case TXN_IGNORE: 353 c_stat = TXN_IGNORE; 354 break; 355 default: 356 c_stat = TXN_ABORT; 357 } 358 ret = __db_txnlist_update(env, 359 info, argp->child, c_stat, NULL, &tmpstat, 0); 360 } else if (c_stat == TXN_UNEXPECTED) { 361 /* 362 * The open after this create failed. If the parent 363 * is rolling forward, we need to roll forward. If 364 * the parent failed, then we do not want to abort 365 * (because the file may not be the one in which we 366 * are interested). 367 */ 368 ret = __db_txnlist_update(env, info, argp->child, 369 p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE, 370 NULL, &tmpstat, 0); 371 } 372 } else if (op == DB_TXN_OPENFILES) { 373 /* 374 * If we have a partial subtransaction, then the whole 375 * transaction should be ignored. 376 */ 377 if ((ret = __db_txnlist_find(env, 378 info, argp->child, &c_stat)) == DB_NOTFOUND) 379 ret = __db_txnlist_update(env, info, 380 argp->txnp->txnid, TXN_IGNORE, 381 NULL, &p_stat, 1); 382 } else if (DB_REDO(op)) { 383 /* Forward Roll */ 384 if ((ret = 385 __db_txnlist_remove(env, info, argp->child)) != 0) 386 __db_errx(env, 387 "Transaction not in list %x", argp->child); 388 } 389 390 if (ret == 0) 391 *lsnp = argp->prev_lsn; 392 393out: __os_free(env, argp); 394 395 return (ret); 396} 397 398/* 399 * __txn_restore_txn -- 400 * Using only during XA recovery. If we find any transactions that are 401 * prepared, but not yet committed, then we need to restore the transaction's 402 * state into the shared region, because the TM is going to issue an abort 403 * or commit and we need to respond correctly. 404 * 405 * lsnp is the LSN of the returned LSN 406 * argp is the prepare record (in an appropriate structure) 407 * 408 * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_xa_regop_args *)); 409 */ 410int 411__txn_restore_txn(env, lsnp, argp) 412 ENV *env; 413 DB_LSN *lsnp; 414 __txn_xa_regop_args *argp; 415{ 416 DB_TXNMGR *mgr; 417 DB_TXNREGION *region; 418 TXN_DETAIL *td; 419 int ret; 420 421 if (argp->xid.size == 0) 422 return (0); 423 424 mgr = env->tx_handle; 425 region = mgr->reginfo.primary; 426 TXN_SYSTEM_LOCK(env); 427 428 /* Allocate a new transaction detail structure. */ 429 if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) { 430 TXN_SYSTEM_UNLOCK(env); 431 return (ret); 432 } 433 434 /* Place transaction on active transaction list. */ 435 SH_TAILQ_INSERT_HEAD(®ion->active_txn, td, links, __txn_detail); 436 437 td->txnid = argp->txnp->txnid; 438 __os_id(env->dbenv, &td->pid, &td->tid); 439 td->last_lsn = *lsnp; 440 td->begin_lsn = argp->begin_lsn; 441 td->parent = INVALID_ROFF; 442 td->name = INVALID_ROFF; 443 SH_TAILQ_INIT(&td->kids); 444 MAX_LSN(td->read_lsn); 445 MAX_LSN(td->visible_lsn); 446 td->mvcc_ref = 0; 447 td->mvcc_mtx = MUTEX_INVALID; 448 td->status = TXN_PREPARED; 449 td->flags = TXN_DTL_RESTORED; 450 td->xa_status = TXN_XA_PREPARED; 451 memcpy(td->xid, argp->xid.data, argp->xid.size); 452 td->bqual = argp->bqual; 453 td->gtrid = argp->gtrid; 454 td->format = argp->formatID; 455 td->nlog_dbs = 0; 456 td->nlog_slots = TXN_NSLOTS; 457 td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots); 458 459 region->stat.st_nrestores++; 460#ifdef HAVE_STATISTICS 461 region->stat.st_nactive++; 462 if (region->stat.st_nactive > region->stat.st_maxnactive) 463 region->stat.st_maxnactive = region->stat.st_nactive; 464#endif 465 TXN_SYSTEM_UNLOCK(env); 466 return (0); 467} 468 469/* 470 * __txn_recycle_recover -- 471 * Recovery function for recycle. 472 * 473 * PUBLIC: int __txn_recycle_recover 474 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 475 */ 476int 477__txn_recycle_recover(env, dbtp, lsnp, op, info) 478 ENV *env; 479 DBT *dbtp; 480 DB_LSN *lsnp; 481 db_recops op; 482 void *info; 483{ 484 __txn_recycle_args *argp; 485 int ret; 486 487#ifdef DEBUG_RECOVER 488 (void)__txn_child_print(env, dbtp, lsnp, op, info); 489#endif 490 if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0) 491 return (ret); 492 493 COMPQUIET(lsnp, NULL); 494 495 if ((ret = __db_txnlist_gen(env, info, 496 DB_UNDO(op) ? -1 : 1, argp->min, argp->max)) != 0) 497 return (ret); 498 499 __os_free(env, argp); 500 501 return (0); 502} 503 504/* 505 * PUBLIC: int __txn_regop_42_recover 506 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 507 * 508 * These records are only ever written for commits. Normally, we redo any 509 * committed transaction, however if we are doing recovery to a timestamp, then 510 * we may treat transactions that committed after the timestamp as aborted. 511 */ 512int 513__txn_regop_42_recover(env, dbtp, lsnp, op, info) 514 ENV *env; 515 DBT *dbtp; 516 DB_LSN *lsnp; 517 db_recops op; 518 void *info; 519{ 520 __txn_regop_42_args *argp; 521 DB_TXNHEAD *headp; 522 u_int32_t status; 523 int ret; 524 525#ifdef DEBUG_RECOVER 526 (void)__txn_regop_42_print(env, dbtp, lsnp, op, info); 527#endif 528 529 if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0) 530 return (ret); 531 532 headp = info; 533 /* 534 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL. 535 * We check for the former explicitly and the last two clauses 536 * apply to the BACKWARD_ROLL case. 537 */ 538 539 if (op == DB_TXN_FORWARD_ROLL) { 540 /* 541 * If this was a 2-phase-commit transaction, then it 542 * might already have been removed from the list, and 543 * that's OK. Ignore the return code from remove. 544 */ 545 if ((ret = __db_txnlist_remove(env, 546 info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0) 547 goto err; 548 } else if ((env->dbenv->tx_timestamp != 0 && 549 argp->timestamp > (int32_t)env->dbenv->tx_timestamp) || 550 (!IS_ZERO_LSN(headp->trunc_lsn) && 551 LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) { 552 /* 553 * We failed either the timestamp check or the trunc_lsn check, 554 * so we treat this as an abort even if it was a commit record. 555 */ 556 if ((ret = __db_txnlist_update(env, info, 557 argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0) 558 goto err; 559 else if (status != TXN_IGNORE && status != TXN_OK) 560 goto err; 561 } else { 562 /* This is a normal commit; mark it appropriately. */ 563 if ((ret = __db_txnlist_update(env, 564 info, argp->txnp->txnid, argp->opcode, lsnp, 565 &status, 0)) == DB_NOTFOUND) { 566 if ((ret = __db_txnlist_add(env, 567 info, argp->txnp->txnid, 568 argp->opcode == TXN_ABORT ? 569 TXN_IGNORE : argp->opcode, lsnp)) != 0) 570 goto err; 571 } else if (ret != 0 || 572 (status != TXN_IGNORE && status != TXN_OK)) 573 goto err; 574 } 575 576 if (ret == 0) 577 *lsnp = argp->prev_lsn; 578 579 if (0) { 580err: __db_errx(env, 581 "txnid %lx commit record found, already on commit list", 582 (u_long)argp->txnp->txnid); 583 ret = EINVAL; 584 } 585 __os_free(env, argp); 586 587 return (ret); 588} 589 590/* 591 * PUBLIC: int __txn_ckp_42_recover 592 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 593 */ 594int 595__txn_ckp_42_recover(env, dbtp, lsnp, op, info) 596 ENV *env; 597 DBT *dbtp; 598 DB_LSN *lsnp; 599 db_recops op; 600 void *info; 601{ 602 __txn_ckp_42_args *argp; 603 int ret; 604 605#ifdef DEBUG_RECOVER 606 __txn_ckp_42_print(env, dbtp, lsnp, op, info); 607#endif 608 if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0) 609 return (ret); 610 611 if (op == DB_TXN_BACKWARD_ROLL) 612 __db_txnlist_ckp(env, info, lsnp); 613 614 *lsnp = argp->last_ckp; 615 __os_free(env, argp); 616 return (DB_TXN_CKP); 617} 618